@@ -50,7 +50,6 @@ class BaseParser
5050
5151 DOCTYPE_START = /\A \s *<!DOCTYPE\s /um
5252 DOCTYPE_END = /\A \s *\] \s *>/um
53- DOCTYPE_PATTERN = /\s *<!DOCTYPE\s +(.*?)(\[ |>)/um
5453 ATTRIBUTE_PATTERN = /\s *(#{ QNAME_STR } )\s *=\s *(["'])(.*?)\4 /um
5554 COMMENT_START = /\A <!--/u
5655 COMMENT_PATTERN = /<!--(.*?)-->/um
@@ -69,7 +68,6 @@ class BaseParser
6968 STANDALONE = /\b standalone\s *=\s *["'](.*?)['"]/um
7069
7170 ENTITY_START = /\A \s *<!ENTITY/
72- IDENTITY = /^([!\* \w \- ]+)(\s +#{ NCNAME_STR } )?(\s +["'](.*?)['"])?(\s +['"](.*?)["'])?/u
7371 ELEMENTDECL_START = /\A \s *<!ELEMENT/um
7472 ELEMENTDECL_PATTERN = /\A \s *(<!ELEMENT.*?)>/um
7573 SYSTEMENTITY = /\A \s *(%.*?;)\s *$/um
@@ -101,8 +99,9 @@ class BaseParser
10199 ENTITYDECL = /\s *(?:#{ GEDECL } )|(?:#{ PEDECL } )/um
102100
103101 NOTATIONDECL_START = /\A \s *<!NOTATION/um
104- PUBLIC = /\A \s *<!NOTATION\s +#{ NAME } \s +(PUBLIC)\s +#{ PUBIDLITERAL } (?:\s +#{ SYSTEMLITERAL } )?\s *>/um
105- SYSTEM = /\A \s *<!NOTATION\s +#{ NAME } \s +(SYSTEM)\s +#{ SYSTEMLITERAL } \s *>/um
102+ EXTERNAL_ID_PUBLIC = /\A \s *PUBLIC\s +#{ PUBIDLITERAL } \s +#{ SYSTEMLITERAL } \s */um
103+ EXTERNAL_ID_SYSTEM = /\A \s *SYSTEM\s +#{ SYSTEMLITERAL } \s */um
104+ PUBLIC_ID = /\A \s *PUBLIC\s +#{ PUBIDLITERAL } \s */um
106105
107106 EREFERENCE = /&(?!#{ NAME } ;)/
108107
@@ -225,24 +224,37 @@ def pull_event
225224 when INSTRUCTION_START
226225 return process_instruction
227226 when DOCTYPE_START
228- md = @source . match ( DOCTYPE_PATTERN , true )
227+ base_error_message = "Malformed DOCTYPE"
228+ @source . match ( DOCTYPE_START , true )
229229 @nsstack . unshift ( curr_ns = Set . new )
230- identity = md [ 1 ]
231- close = md [ 2 ]
232- identity =~ IDENTITY
233- name = $1
234- raise REXML ::ParseException . new ( "DOCTYPE is missing a name" ) if name . nil?
235- pub_sys = $2. nil? ? nil : $2. strip
236- long_name = $4. nil? ? nil : $4. strip
237- uri = $6. nil? ? nil : $6. strip
238- args = [ :start_doctype , name , pub_sys , long_name , uri ]
239- if close == ">"
230+ name = parse_name ( base_error_message )
231+ if @source . match ( /\A \s *\[ /um , true )
232+ id = [ nil , nil , nil ]
233+ @document_status = :in_doctype
234+ elsif @source . match ( /\A \s *>/um , true )
235+ id = [ nil , nil , nil ]
240236 @document_status = :after_doctype
241- @source . read if @source . buffer . size <2
242- md = @source . match ( /^\s */um , true )
243- @stack << [ :end_doctype ]
244237 else
245- @document_status = :in_doctype
238+ id = parse_id ( base_error_message ,
239+ accept_external_id : true ,
240+ accept_public_id : false )
241+ if id [ 0 ] == "SYSTEM"
242+ # For backward compatibility
243+ id [ 1 ] , id [ 2 ] = id [ 2 ] , nil
244+ end
245+ if @source . match ( /\A \s *\[ /um , true )
246+ @document_status = :in_doctype
247+ elsif @source . match ( /\A \s *>/um , true )
248+ @document_status = :after_doctype
249+ else
250+ message = "#{ base_error_message } : garbage after external ID"
251+ raise REXML ::ParseException . new ( message , @source )
252+ end
253+ end
254+ args = [ :start_doctype , name , *id ]
255+ if @document_status == :after_doctype
256+ @source . match ( /\A \s */um , true )
257+ @stack << [ :end_doctype ]
246258 end
247259 return args
248260 when /^\s +/
@@ -313,27 +325,24 @@ def pull_event
313325 end
314326 return [ :attlistdecl , element , pairs , contents ]
315327 when NOTATIONDECL_START
316- md = nil
317- if @source . match ( PUBLIC )
318- md = @source . match ( PUBLIC , true )
319- pubid = system = nil
320- pubid_literal = md [ 3 ]
321- pubid = pubid_literal [ 1 ..-2 ] if pubid_literal # Remove quote
322- system_literal = md [ 4 ]
323- system = system_literal [ 1 ..-2 ] if system_literal # Remove quote
324- vals = [ md [ 1 ] , md [ 2 ] , pubid , system ]
325- elsif @source . match ( SYSTEM )
326- md = @source . match ( SYSTEM , true )
327- system = nil
328- system_literal = md [ 3 ]
329- system = system_literal [ 1 ..-2 ] if system_literal # Remove quote
330- vals = [ md [ 1 ] , md [ 2 ] , nil , system ]
331- else
332- details = notation_decl_invalid_details
333- message = "Malformed notation declaration: #{ details } "
328+ base_error_message = "Malformed notation declaration"
329+ unless @source . match ( /\A \s *<!NOTATION\s +/um , true )
330+ if @source . match ( /\A \s *<!NOTATION\s *>/um )
331+ message = "#{ base_error_message } : name is missing"
332+ else
333+ message = "#{ base_error_message } : invalid declaration name"
334+ end
335+ raise REXML ::ParseException . new ( message , @source )
336+ end
337+ name = parse_name ( base_error_message )
338+ id = parse_id ( base_error_message ,
339+ accept_external_id : true ,
340+ accept_public_id : true )
341+ unless @source . match ( /\A \s *>/um , true )
342+ message = "#{ base_error_message } : garbage before end >"
334343 raise REXML ::ParseException . new ( message , @source )
335344 end
336- return [ :notationdecl , * vals ]
345+ return [ :notationdecl , name , * id ]
337346 when DOCTYPE_END
338347 @document_status = :after_doctype
339348 @source . match ( DOCTYPE_END , true )
@@ -488,6 +497,85 @@ def need_source_encoding_update?(xml_declaration_encoding)
488497 true
489498 end
490499
# Matches optional leading whitespace followed by NAME at the start of
# @source and returns the first capture group of that match.
#
# base_error_message:: prefix placed before the failure reason in the
#                      exception message.
#
# Raises REXML::ParseException when NAME does not match:
# * "<prefix>: invalid name" if some non-whitespace text is still ahead
# * "<prefix>: name is missing" otherwise
#
# NOTE(review): the +true+ second argument to @source.match presumably
# consumes the matched text, and NAME is assumed to contain exactly the
# capture group read below -- confirm against Source#match and NAME.
def parse_name(base_error_message)
  matched = @source.match(/\A\s*#{NAME}/um, true)
  return matched[1] if matched

  # Look (without the +true+ flag) at what is ahead only to pick the reason.
  reason = @source.match(/\A\s*\S/um) ? "invalid name" : "name is missing"
  raise REXML::ParseException.new("#{base_error_message}: #{reason}", @source)
end
512+
# Parses an ID at the start of @source and returns it as a three element
# array: [type, public_id, system_id].
#
# The patterns are tried in this order, each only when its flag allows it:
# 1. EXTERNAL_ID_PUBLIC (accept_external_id) -> ["PUBLIC", pubid, system]
# 2. PUBLIC_ID          (accept_public_id)   -> ["PUBLIC", pubid, nil]
# 3. EXTERNAL_ID_SYSTEM (accept_external_id) -> ["SYSTEM", nil, system]
#
# Captured literals have their first and last character (the surrounding
# quotes) removed before being returned.
#
# base_error_message:: prefix for the exception message.
# accept_external_id:: allow the PUBLIC+system and SYSTEM forms.
# accept_public_id::   allow the PUBLIC form that has no system literal.
#
# Raises REXML::ParseException with
# "<prefix>: <parse_id_invalid_details result>" when nothing matches.
def parse_id(base_error_message,
             accept_external_id:,
             accept_public_id:)
  # Drops the quote characters around a captured literal; nil stays nil.
  unquote = lambda { |literal| literal ? literal[1..-2] : nil }

  if accept_external_id && (found = @source.match(EXTERNAL_ID_PUBLIC, true))
    return ["PUBLIC", unquote.call(found[1]), unquote.call(found[2])]
  end

  if accept_public_id && (found = @source.match(PUBLIC_ID, true))
    return ["PUBLIC", unquote.call(found[1]), nil]
  end

  if accept_external_id && (found = @source.match(EXTERNAL_ID_SYSTEM, true))
    return ["SYSTEM", nil, unquote.call(found[1])]
  end

  details = parse_id_invalid_details(accept_external_id: accept_external_id,
                                     accept_public_id: accept_public_id)
  raise REXML::ParseException.new("#{base_error_message}: #{details}", @source)
end
540+
# Builds the reason text used when parse_id could not match an ID.
#
# It only probes @source (every match below is made without the +true+
# flag) and returns one of these strings:
#
# * PUBLIC ahead (and either flag set):
#   "public ID literal is missing", "invalid public ID literal", then
#   - with accept_public_id: "system ID literal is missing",
#     "invalid system literal" or "garbage after system literal"
#   - without it: "garbage after public ID literal"
# * SYSTEM ahead (and accept_external_id):
#   "system literal is missing", "invalid system literal" or
#   "garbage after system literal"
# * otherwise: "invalid ID type", or "ID type is missing" when PUBLIC or
#   SYSTEM followed by whitespace is ahead.
def parse_id_invalid_details(accept_external_id:,
                             accept_public_id:)
  public_keyword = /\A\s*PUBLIC/um
  system_keyword = /\A\s*SYSTEM/um

  if (accept_external_id || accept_public_id) && @source.match(/#{public_keyword}/um)
    # PUBLIC followed by something that cannot start a quoted literal.
    if @source.match(/#{public_keyword}(?:\s+[^'"]|\s*[\[>])/um)
      return "public ID literal is missing"
    end
    unless @source.match(/#{public_keyword}\s+#{PUBIDLITERAL}/um)
      return "invalid public ID literal"
    end
    return "garbage after public ID literal" unless accept_public_id

    if @source.match(/#{public_keyword}\s+#{PUBIDLITERAL}\s+[^'"]/um)
      return "system ID literal is missing"
    end
    unless @source.match(/#{public_keyword}\s+#{PUBIDLITERAL}\s+#{SYSTEMLITERAL}/um)
      return "invalid system literal"
    end
    return "garbage after system literal"
  end

  if accept_external_id && @source.match(/#{system_keyword}/um)
    # SYSTEM followed by something that cannot start a quoted literal.
    if @source.match(/#{system_keyword}(?:\s+[^'"]|\s*[\[>])/um)
      return "system literal is missing"
    end
    unless @source.match(/#{system_keyword}\s+#{SYSTEMLITERAL}/um)
      return "invalid system literal"
    end
    return "garbage after system literal"
  end

  return "invalid ID type" unless @source.match(/\A\s*(?:PUBLIC|SYSTEM)\s/um)

  "ID type is missing"
end
578+
491579 def process_instruction
492580 match_data = @source . match ( INSTRUCTION_PATTERN , true )
493581 unless match_data
@@ -580,42 +668,6 @@ def parse_attributes(prefixes, curr_ns)
580668 end
581669 return attributes , closed
582670 end
583-
584- def notation_decl_invalid_details
585- name = /#{ NOTATIONDECL_START } \s +#{ NAME } /um
586- public = /#{ name } \s +PUBLIC/um
587- system = /#{ name } \s +SYSTEM/um
588- if @source . match ( /#{ NOTATIONDECL_START } \s *>/um )
589- return "name is missing"
590- elsif not @source . match ( /#{ name } [\s >]/um )
591- return "invalid name"
592- elsif @source . match ( /#{ name } \s *>/um )
593- return "ID type is missing"
594- elsif not @source . match ( /#{ name } \s +(?:PUBLIC|SYSTEM)[\s >]/um )
595- return "invalid ID type"
596- elsif @source . match ( /#{ public } /um )
597- if @source . match ( /#{ public } \s *>/um )
598- return "public ID literal is missing"
599- elsif not @source . match ( /#{ public } \s +#{ PUBIDLITERAL } /um )
600- return "invalid public ID literal"
601- elsif @source . match ( /#{ public } \s +#{ PUBIDLITERAL } [^\s >]/um )
602- return "garbage after public ID literal"
603- elsif not @source . match ( /#{ public } \s +#{ PUBIDLITERAL } \s +#{ SYSTEMLITERAL } /um )
604- return "invalid system literal"
605- elsif not @source . match ( /#{ public } \s +#{ PUBIDLITERAL } \s +#{ SYSTEMLITERAL } \s *>/um )
606- return "garbage after system literal"
607- end
608- elsif @source . match ( /#{ system } /um )
609- if @source . match ( /#{ system } \s *>/um )
610- return "system literal is missing"
611- elsif not @source . match ( /#{ system } \s +#{ SYSTEMLITERAL } /um )
612- return "invalid system literal"
613- elsif not @source . match ( /#{ system } \s +#{ SYSTEMLITERAL } \s *>/um )
614- return "garbage after system literal"
615- end
616- end
617- "end > is missing"
618- end
619671 end
620672 end
621673end
0 commit comments