r1479@bean: ser | 2008-01-19 14:26:31 -0500

r1483@bean: ser | 2008-01-19 14:47:23 -0500 Sam's fixes: * Don't blow up on empty documents * Add a test case for sorted attributes * Making the output predictable simplifies unit tests, and doesn't cost much given that most XML elements have few attributes * Ruby 1.9 revision 14922 is more strict * Complete Ticket #134 * Fix for ticket #121 * Fix for ticket #124 * Fix for ticket #128 * Fix ticket #133 * Ticket #131 (Support Ruby 1.9) * Fix for ticket #127 * Fix for ticket #123 * Add missing data needed by test case r1481@bean (orig r1303): ser | 2008-01-19 17:22:32 -0500 Tagged for release r1482@bean (orig r1304): ser | 2008-01-19 17:27:10 -0500 Version bump git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@15141 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
2008-01-20 04:31:57 +00:00 · 2008-01-20 04:31:57 +00:00 · 66aeb2f708
--- a/lib/rexml/attribute.rb
+++ b/lib/rexml/attribute.rb
@ -17,6 +17,8 @@ module REXML
 		attr_writer :normalized	
 		PATTERN = /\s*(#{NAME_STR})\s*=\s*(["'])(.*?)\2/um

+    NEEDS_A_SECOND_CHECK = /(<|&((#{Entity::NAME});|(#0*((?:\d+)|(?:x[a-fA-F0-9]+)));)?)/um
+
 		# Constructor.
    # FIXME: The parser doesn't catch illegal characters in attributes
    #
@ -110,15 +112,16 @@ module REXML
 			end
 		end

-		# Returns the attribute value, with entities replaced
-		def to_s
-			return @normalized if @normalized
-
-			doctype = nil
+    def doctype
 			if @element
 				doc = @element.document
 				doctype = doc.doctype if doc
 			end
+    end
+
+		# Returns the attribute value, with entities replaced
+		def to_s
+			return @normalized if @normalized

 			@normalized = Text::normalize( @unnormalized, doctype )
 			@unnormalized = nil
@ -129,11 +132,6 @@ module REXML
 		# have been expanded to their values
 		def value
 			return @unnormalized if @unnormalized
-			doctype = nil
-			if @element
-				doc = @element.document
-				doctype = doc.doctype if doc
-			end
 			@unnormalized = Text::unnormalize( @normalized, doctype )
 			@normalized = nil
      @unnormalized
@ -150,6 +148,11 @@ module REXML
 		# Returns this attribute
 		def element=( element )
 			@element = element
+
+      if @normalized
+        Text.check( @normalized, NEEDS_A_SECOND_CHECK, doctype )
+      end
+
 			self
 		end

--- a/lib/rexml/cdata.rb
+++ b/lib/rexml/cdata.rb
@ -13,7 +13,7 @@ module REXML
 		#  CData.new( "Here is some CDATA" )
 		#  CData.new( "Some unprocessed data", respect_whitespace_TF, parent_element )
 		def initialize( first, whitespace=true, parent=nil )
-			super( first, whitespace, parent, true, true, ILLEGAL )
+			super( first, whitespace, parent, false, true, ILLEGAL )
 		end

 		# Make a copy of this object
--- a/lib/rexml/element.rb
+++ b/lib/rexml/element.rb
@ -558,7 +558,19 @@ module REXML
        prefix = namespaces.index(namespace) if namespace
      end
      prefix = nil if prefix == 'xmlns'
+
+      ret_val = 
        attributes.get_attribute( "#{prefix ? prefix + ':' : ''}#{name}" )
+
+      return ret_val unless ret_val.nil?
+      return nil if prefix.nil?
+
+      # no prefixed match: fall back to the unprefixed attribute, but only
+      # when the prefix's namespace is the same as the default namespace
+      return nil unless ( namespaces[ prefix ] == namespaces[ 'xmlns' ] )
+
+      attributes.get_attribute( name )
+
    end

    # Evaluates to +true+ if this element has any attributes set, false
@ -675,7 +687,7 @@ module REXML
    #  out = ''
    #  doc.write( out )     #-> doc is written to the string 'out'
    #  doc.write( $stdout ) #-> doc written to the console
-    def write(writer=$stdout, indent=-1, transitive=false, ie_hack=false)
+    def write(output=$stdout, indent=-1, transitive=false, ie_hack=false)
      Kernel.warn("#{self.class.name}.write is deprecated.  See REXML::Formatters")
      formatter = if indent > -1
          if transitive
@ -1217,14 +1229,17 @@ module REXML
    # 
    # Method contributed by Henrik Martensson
    def get_attribute_ns(namespace, name)
+      result = nil
      each_attribute() { |attribute|
        if name == attribute.name &&
          namespace == attribute.namespace() &&
          ( !namespace.empty? || !attribute.fully_expanded_name.index(':') )
-          return attribute
+          # foo will match xmlns:foo, but only if foo isn't also an attribute
+          result = attribute if !result or !namespace.empty? or 
+                                !attribute.fully_expanded_name.index(':')
        end
      }
-      nil
+      result
    end
  end
 end
--- a/lib/rexml/formatters/default.rb
+++ b/lib/rexml/formatters/default.rb
@ -63,7 +63,7 @@ module REXML
      def write_element( node, output )
        output << "<#{node.expanded_name}"

-        node.attributes.each_attribute do |attr|
+        node.attributes.to_a.sort_by {|attr| attr.name}.each do |attr|
          output << " "
          attr.write( output )
        end unless node.attributes.empty?
--- a/lib/rexml/parsers/baseparser.rb
+++ b/lib/rexml/parsers/baseparser.rb
@ -25,7 +25,20 @@ module REXML
    #
    # Nat Price gave me some good ideas for the API.
    class BaseParser
-      NCNAME_STR= '[\w:][\-\w\d.]*'
+      if String.method_defined? :encode
+        # Oniguruma / POSIX [understands unicode]
+        LETTER = '[[:alpha:]]'
+        DIGIT = '[[:digit:]]'
+      else
+        # Ruby < 1.9 [doesn't understand unicode]
+        LETTER = 'a-zA-Z'
+        DIGIT = '\d'
+      end
+
+      COMBININGCHAR = '' # TODO
+      EXTENDER = ''      # TODO
+
+      NCNAME_STR= "[#{LETTER}_:][-#{LETTER}#{DIGIT}._:#{COMBININGCHAR}#{EXTENDER}]*"
      NAME_STR= "(?:(#{NCNAME_STR}):)?(#{NCNAME_STR})"
      UNAME_STR= "(?:#{NCNAME_STR}:)?#{NCNAME_STR}"

@ -33,7 +46,7 @@ module REXML
      NAME = "([\\w:]#{NAMECHAR}*)"
      NMTOKEN = "(?:#{NAMECHAR})+"
      NMTOKENS = "#{NMTOKEN}(\\s+#{NMTOKEN})*"
-      REFERENCE = "(?:&#{NAME};|&#\\d+;|&#x[0-9a-fA-F]+;)"
+      REFERENCE = "&(?:#{NAME};|#\\d+;|#x[0-9a-fA-F]+;)"
      REFERENCE_RE = /#{REFERENCE}/

      DOCTYPE_START = /\A\s*<!DOCTYPE\s/um
@ -340,6 +353,12 @@ module REXML
              raise REXML::ParseException.new("Malformed node", @source) unless md
              if md[0][2] == ?-
                md = @source.match( COMMENT_PATTERN, true )
+
+                case md[1]
+                when /--/, /-$/
+                  raise REXML::ParseException.new("Malformed comment", @source)
+                end
+
                return [ :comment, md[1] ] if md
              else
                md = @source.match( CDATA_PATTERN, true )
@ -384,6 +403,12 @@ module REXML
                  elsif b
                    prefixes << b unless b == "xml"
                  end
+
+                  if attributes.has_key? a
+                    msg = "Duplicate attribute #{a.inspect}"
+                    raise REXML::ParseException.new( msg, @source, self)
+                  end
+
                  attributes[a] = e 
                }
              end
@ -470,15 +495,12 @@ module REXML
              if entity_value
                re = /&#{entity_reference};/
                rv.gsub!( re, entity_value )
-              end
-            end
-          end
-          matches.each do |entity_reference|
-            unless filter and filter.include?(entity_reference)
+              else
                er = DEFAULT_ENTITIES[entity_reference]
                rv.gsub!( er[0], er[2] ) if er
              end
            end
+          end
          rv.gsub!( /&amp;/, '&' )
        end
        rv
--- a/lib/rexml/rexml.rb
+++ b/lib/rexml/rexml.rb
@ -11,8 +11,8 @@
 #
 # Main page:: http://www.germane-software.com/software/rexml
 # Author:: Sean Russell <serATgermaneHYPHENsoftwareDOTcom>
-# Version:: 3.1.7.2
-# Date:: 2007/275
+# Date:: 2008/019
+# Version:: 3.1.7.3
 # 
 # This API documentation can be downloaded from the REXML home page, or can
 # be accessed online[http://www.germane-software.com/software/rexml_doc]
@ -21,9 +21,9 @@
 # or can be accessed 
 # online[http://www.germane-software.com/software/rexml/docs/tutorial.html]
 module REXML
-  COPYRIGHT = "Copyright © 2001-2007 Sean Russell <ser@germane-software.com>"
-  DATE = "2007/275"
-  VERSION = "3.1.7.2"
+  COPYRIGHT = "Copyright © 2001-2008 Sean Russell <ser@germane-software.com>"
+  DATE = "2008/019"
+  VERSION = "3.1.7.3"
  REVISION = "$Revision$".gsub(/\$Revision:|\$/,'').strip

  Copyright = COPYRIGHT
--- a/lib/rexml/source.rb
+++ b/lib/rexml/source.rb
@ -147,7 +147,7 @@ module REXML
      # the XML spec.  If there is one, we can determine the encoding from
      # it.
      @buffer = ""
-      str = @source.read( 2 )
+      str = @source.read( 2 ) || ''
      if encoding
        self.encoding = encoding
      elsif str[0,2] == "\xfe\xff"
@ -161,7 +161,7 @@ module REXML
      else
        @line_break = ">"
      end
-      super str+@source.readline( @line_break )
+      super( @source.eof? ? str : str+@source.readline( @line_break ) )
    end

    def scan(pattern, cons=false)
@ -231,7 +231,7 @@ module REXML
    end

    def position
-      @er_source.stat.pipe? ? 0 : @er_source.pos
+      @er_source.pos rescue 0
    end

    # @return the current line in the source
--- a/lib/rexml/text.rb
+++ b/lib/rexml/text.rb
@ -18,8 +18,40 @@ module REXML
    # If +raw+ is true, then REXML leaves the value alone
    attr_accessor :raw

-    ILLEGAL = /(<|&(?!(#{Entity::NAME})|(#0*((?:\d+)|(?:x[a-fA-F0-9]+)));))/um
+    NEEDS_A_SECOND_CHECK = /(<|&((#{Entity::NAME});|(#0*((?:\d+)|(?:x[a-fA-F0-9]+)));)?)/um
    NUMERICENTITY = /&#0*((?:\d+)|(?:x[a-fA-F0-9]+));/ 
+    VALID_CHAR = [
+      0x9, 0xA, 0xD,
+      (0x20..0xD7FF),
+      (0xE000..0xFFFD),
+      (0x10000..0x10FFFF)
+    ]
+
+    if String.method_defined? :encode
+      VALID_XML_CHARS = Regexp.new('^['+
+        VALID_CHAR.map { |item|
+          case item
+          when Fixnum
+            [item].pack('U').force_encoding('utf-8')
+          when Range
+            [item.first, '-'.ord, item.last].pack('UUU').force_encoding('utf-8')
+          end
+        }.join +
+      ']*$')
+    else
+      VALID_XML_CHARS = /^(
+           [\x09\x0A\x0D\x20-\x7E]            # ASCII
+         | [\xC2-\xDF][\x80-\xBF]             # non-overlong 2-byte
+         |  \xE0[\xA0-\xBF][\x80-\xBF]        # excluding overlongs
+         | [\xE1-\xEC\xEE][\x80-\xBF]{2}      # straight 3-byte
+         |  \xEF[\x80-\xBE]{2}                #
+         |  \xEF\xBF[\x80-\xBD]               # excluding U+fffe and U+ffff
+         |  \xED[\x80-\x9F][\x80-\xBF]        # excluding surrogates
+         |  \xF0[\x90-\xBF][\x80-\xBF]{2}     # planes 1-3
+         | [\xF1-\xF3][\x80-\xBF]{3}          # planes 4-15
+         |  \xF4[\x80-\x8F][\x80-\xBF]{2}     # plane 16
+       )*$/x; 
+    end

    # Constructor
    # +arg+ if a String, the content is set to the String.  If a Text,
@ -58,7 +90,7 @@ module REXML
    #
    # +pattern+ INTERNAL USE ONLY
    def initialize(arg, respect_whitespace=false, parent=nil, raw=nil, 
-      entity_filter=nil, illegal=ILLEGAL )
+      entity_filter=nil, illegal=NEEDS_A_SECOND_CHECK )

      @raw = false

@ -85,10 +117,54 @@ module REXML

      @string.gsub!( /\r\n?/, "\n" )

+      Text.check(@string, NEEDS_A_SECOND_CHECK, doctype) if @raw and @parent
+    end
+
+    def parent= parent
+      super(parent)
+      Text.check(@string, NEEDS_A_SECOND_CHECK, doctype) if @raw and @parent
+    end
+
    # check for illegal characters
-      if @raw
-        if @string =~ illegal
-          raise "Illegal character '#{$1}' in raw string \"#{@string}\""
+    def Text.check string, pattern, doctype
+
+      # illegal anywhere
+      if string !~ VALID_XML_CHARS
+        if String.method_defined? :encode
+          string.chars.each do |c|
+            case c.ord
+            when *VALID_CHAR
+            else
+              raise "Illegal character #{c.inspect} in raw string \"#{string}\""
+            end
+          end
+        else
+          string.scan(/[\x00-\x7F]|[\x80-\xBF][\xC0-\xF0]*|[\xC0-\xF0]/) do |c|
+            case c.unpack('U')
+            when *VALID_CHAR
+            else
+              raise "Illegal character #{c.inspect} in raw string \"#{string}\""
+            end
+          end
+        end
+      end
+
+      # context sensitive
+      string.scan(pattern).each do
+        if $1[-1] != ?;
+          raise "Illegal character '#{$1}' in raw string \"#{string}\""
+        elsif $1[0] == ?&
+          if $5 and $5[0] == ?#
+            case ($5[1] == ?x ? $5[2..-1].to_i(16) : $5[1..-1].to_i)
+            when *VALID_CHAR
+            else
+              raise "Illegal character '#{$1}' in raw string \"#{string}\""
+            end
+          elsif $3 and !SUBSTITUTES.include?($1)
+            if !doctype or !doctype.entities.has_key?($3)
+              raise "Undeclared entity '#{$1}' in raw string \"#{string}\""
+            end
+          end
        end
      end
    end
@ -120,6 +196,13 @@ module REXML
      to_s() <=> other.to_s
    end

+    def doctype
+      if @parent
+        doc = @parent.document
+        doc.doctype if doc
+      end
+    end
+
    REFERENCE = /#{Entity::REFERENCE}/
    # Returns the string value of this text node.  This string is always
    # escaped, meaning that it is a valid XML text node string, and all
@ -138,12 +221,6 @@ module REXML
      return @string if @raw
      return @normalized if @normalized

-      doctype = nil
-      if @parent
-        doc = @parent.document
-        doctype = doc.doctype if doc
-      end
-
      @normalized = Text::normalize( @string, doctype, @entity_filter )
    end

@ -165,12 +242,7 @@ module REXML
    #   u = Text.new( "sean russell", false, nil, true )
    #   u.value   #-> "sean russell"
    def value
-      @unnormalized if @unnormalized
-      doctype = nil
-      if @parent
-        doc = @parent.document
-        doctype = doc.doctype if doc
-      end
+      return @unnormalized if @unnormalized
      @unnormalized = Text::unnormalize( @string, doctype )
    end

@ -286,7 +358,7 @@ module REXML
    EREFERENCE = /&(?!#{Entity::NAME};)/
    # Escapes all possible entities
    def Text::normalize( input, doctype=nil, entity_filter=nil )
-      copy = input
+      copy = input.to_s
      # Doing it like this rather than in a loop improves the speed
      #copy = copy.gsub( EREFERENCE, '&amp;' )
      copy = copy.gsub( "&", "&amp;" )