From ffa1b473f0f30f2cea4318f0b20a4cc9d7e4d331 Mon Sep 17 00:00:00 2001 From: halostatue Date: Sat, 28 Feb 2009 00:13:20 -0500 Subject: [PATCH 1/3] Added a gitignore; put the IANA downloader in the Rakefile. --- .gitignore | 6 ++ Rakefile | 252 +++++++++++++++++++++++++++-------------------------- 2 files changed, 136 insertions(+), 122 deletions(-) create mode 100644 .gitignore diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..072a118 --- /dev/null +++ b/.gitignore @@ -0,0 +1,6 @@ +*.swp +html +doc +pkg +publish +coverage diff --git a/Rakefile b/Rakefile index bc89cf9..5e98857 100644 --- a/Rakefile +++ b/Rakefile @@ -42,8 +42,7 @@ Hoe.new PKG_NAME, PKG_VERSION do |p| p.clean_globs << "coverage" - p.spec_extras[:extra_rdoc_files] = MANIFEST.grep(/txt$/) - - ["Manifest.txt"] + p.spec_extras[:extra_rdoc_files] = MANIFEST.grep(/txt$/) - ["Manifest.txt"] end desc "Build a MIME::Types .tar.gz distribution." @@ -72,7 +71,7 @@ file PKG_TAR => [ :test ] do |t| begin unless File.directory?(File.dirname(t.name)) require 'fileutils' - File.mkdir_p File.dirname(t.name) + FileUtils.mkdir_p File.dirname(t.name) end tf = File.open(t.name, 'wb') gz = Zlib::GzipWriter.new(tf) @@ -118,138 +117,147 @@ task :build_manifest do |t| end desc "Download the current MIME type registrations from IANA." -task :download_from_iana do |t| -#!/usr/bin/ruby -w +task :iana, :save, :destination do |t, args| + save_type = args.save || :text + save_type = save_type.to_sym -require 'rubygems' -require 'open-uri' -require 'nokogiri' -require 'cgi' - -class IANAParser - include Comparable - - INDEX = %q(http://www.iana.org/assignments/media-types/) - CONTACT_PEOPLE = %r{http://www.iana.org/assignments/contact-people.html?#(.*)} - RFC_EDITOR = %r{http://www.rfc-editor.org/rfc/rfc(\d+).txt} - IETF_RFC = %r{http://www.ietf.org/rfc/rfc(\d+).txt} - IETF_RFC_TOOLS = %r{http://tools.ietf.org/html/rfc(\d+)} - - class << self - def load_index - @types ||= {} - - Nokogiri::HTML(open(INDEX) { |f| f.read }).xpath('//p/a').each do |tag| - href_match = %r{^/assignments/media-types/(.+)/$}.match(tag['href']) - next if href_match.nil? - type = href_match.captures[0] - @types[tag.content] = IANAParser.new(tag.content, type) - end - end - - attr_reader :types + case save_type + when :text, :both, :html + nil + else + raise "Unknown save type provided. Must be one of text, both, or html." end - def initialize(name, type) - @name = name - @type = type - @url = File.join(INDEX, @type) - end + destination = args.destination || "type-lists" - attr_reader :name - attr_reader :type - attr_reader :url - attr_reader :html + require 'open-uri' + require 'nokogiri' + require 'cgi' - def download(name = nil) - if name - @html = Nokogiri::HTML(open(name) { |f| f.read }) - else - @html = Nokogiri::HTML(open(@url) { |f| f.read }) - end - end + class IANAParser + include Comparable - def save_html - File.open("#@name.html", "wb") { |w| w.write @html } - end + INDEX = %q(http://www.iana.org/assignments/media-types/) + CONTACT_PEOPLE = %r{http://www.iana.org/assignments/contact-people.html?#(.*)} + RFC_EDITOR = %r{http://www.rfc-editor.org/rfc/rfc(\d+).txt} + IETF_RFC = %r{http://www.ietf.org/rfc/rfc(\d+).txt} + IETF_RFC_TOOLS = %r{http://tools.ietf.org/html/rfc(\d+)} - def <=>(o) - self.name <=> o.name - end + class << self + def load_index + @types ||= {} - def parse - nodes = html.xpath("//table//table//tr") - - # How many children does the first node have? - node_count = nodes.first.children.select { |node| node.elem? }.size - - @mime_types = nodes.map do |node| - next if node == nodes.first - elems = node.children.select { |n| n.elem? } - next if elems.size.zero? - raise "size mismatch #{elems.size} != #{node_count}" if node_count != elems.size - - case elems.size - when 3 - subtype_index = 1 - refnode_index = 2 - when 4 - subtype_index = 1 - refnode_index = 3 - else - raise "Unknown element size." - end - - subtype = elems[subtype_index].content.chomp.strip - refnodes = elems[refnode_index].children.select { |n| n.elem? }.map { |ref| - case ref['href'] - when CONTACT_PEOPLE - tag = CGI::unescape($1).chomp.strip - if tag == ref.content - "[#{ref.content}]" - else - "[#{ref.content}=#{tag}]" - end - when RFC_EDITOR, IETF_RFC, IETF_RFC_TOOLS - "RFC#$1" - when %r{(https?://.*)} - "{#{ref.content}=#$1}" - else - ref + Nokogiri::HTML(open(INDEX) { |f| f.read }).xpath('//p/a').each do |tag| + href_match = %r{^/assignments/media-types/(.+)/$}.match(tag['href']) + next if href_match.nil? + type = href_match.captures[0] + @types[tag.content] = IANAParser.new(tag.content, type) end - } - refs = refnodes.join(',') + end + + attr_reader :types + end + + def initialize(name, type) + @name = name + @type = type + @url = File.join(INDEX, @type) + end + + attr_reader :name + attr_reader :type + attr_reader :url + attr_reader :html + + def download(name = nil) + @html = Nokogiri::HTML(open(name || @url) { |f| f.read }) + end + + def save_html + File.open("#@name.html", "wb") { |w| w.write @html } + end + + def <=>(o) + self.name <=> o.name + end + + def parse + nodes = html.xpath("//table//table//tr") + + # How many children does the first node have? + node_count = nodes.first.children.select { |node| node.elem? }.size + + @mime_types = nodes.map do |node| + next if node == nodes.first + elems = node.children.select { |n| n.elem? } + next if elems.size.zero? + raise "size mismatch #{elems.size} != #{node_count}" if node_count != elems.size + + case elems.size + when 3 + subtype_index = 1 + refnode_index = 2 + when 4 + subtype_index = 1 + refnode_index = 3 + else + raise "Unknown element size." + end + + subtype = elems[subtype_index].content.chomp.strip + refnodes = elems[refnode_index].children.select { |n| n.elem? }.map { |ref| + case ref['href'] + when CONTACT_PEOPLE + tag = CGI::unescape($1).chomp.strip + if tag == ref.content + "[#{ref.content}]" + else + "[#{ref.content}=#{tag}]" + end + when RFC_EDITOR, IETF_RFC, IETF_RFC_TOOLS + "RFC#$1" + when %r{(https?://.*)} + "{#{ref.content}=#$1}" + else + ref + end + } + refs = refnodes.join(',') "#@type/#{subtype} 'IANA,#{refs}" - end.compact + end.compact - @mime_types + @mime_types + end + + def save_text + File.open("#@name.txt", "wb") { |w| w.write @mime_types.join("\n") } + end end - def save_text - File.open("#@name.txt", "wb") { |w| w.write @mime_types.join("\n") } + puts "Downloading index of MIME types from #{IANAParser::INDEX}." + IANAParser.load_index + + require 'fileutils' + FileUtils.mkdir_p destination + Dir.chdir destination do + IANAParser.types.values.sort.each do |parser| + next if parser.name == "example" or parser.name == "mime" + puts "Downloading #{parser.name} from #{parser.url}" + parser.download + + if :html == save_type || :both == save_type + puts "Saving #{parser.name}.html" + parser.save_html + end + + puts "Parsing #{parser.name} HTML" + parser.parse + + if :text == save_type || :both == save_type + puts "Saving #{parser.name}.txt" + parser.save_text + end + end end end - -puts "Downloading index of MIME types from #{IANAParser::INDEX}." -IANAParser.load_index - -IANAParser.types.values.sort.each do |parser| - next if parser.name == "example" or parser.name == "mime" - puts "Downloading #{parser.name} from #{parser.url}" - parser.download - puts "Saving #{parser.name}.html" - parser.save_html - puts "Parsing #{parser.name}" - parser.parse - puts "Saving #{parser.name}.txt" - parser.save_text -end - -# foo = IANAParser.types['application'] -# foo.download("application.html") -# foo.parse -# foo = IANAParser.types['image'] -# foo.download("image.html") -# foo.parse -end From d1bc6380c1c0e58c7c90dd77045e8187ff8ba4af Mon Sep 17 00:00:00 2001 From: halostatue Date: Sat, 28 Feb 2009 00:14:00 -0500 Subject: [PATCH 2/3] Updated gitignore to ignore type-lists; removed get-latest.rb since it's now in the rakefile. --- .gitignore | 1 + type-lists/get-latest.rb | 133 --------------------------------------- 2 files changed, 1 insertion(+), 133 deletions(-) delete mode 100755 type-lists/get-latest.rb diff --git a/.gitignore b/.gitignore index 072a118..c1ee442 100644 --- a/.gitignore +++ b/.gitignore @@ -4,3 +4,4 @@ doc pkg publish coverage +type-lists diff --git a/type-lists/get-latest.rb b/type-lists/get-latest.rb deleted file mode 100755 index 6a9edda..0000000 --- a/type-lists/get-latest.rb +++ /dev/null @@ -1,133 +0,0 @@ -#!/usr/bin/ruby -w - -require 'rubygems' -require 'open-uri' -require 'nokogiri' -require 'cgi' - -class IANAParser - include Comparable - - INDEX = %q(http://www.iana.org/assignments/media-types/) - CONTACT_PEOPLE = %r{http://www.iana.org/assignments/contact-people.html?#(.*)} - RFC_EDITOR = %r{http://www.rfc-editor.org/rfc/rfc(\d+).txt} - IETF_RFC = %r{http://www.ietf.org/rfc/rfc(\d+).txt} - IETF_RFC_TOOLS = %r{http://tools.ietf.org/html/rfc(\d+)} - - class << self - def load_index - @types ||= {} - - Nokogiri::HTML(open(INDEX) { |f| f.read }).xpath('//p/a').each do |tag| - href_match = %r{^/assignments/media-types/(.+)/$}.match(tag['href']) - next if href_match.nil? - type = href_match.captures[0] - @types[tag.content] = IANAParser.new(tag.content, type) - end - end - - attr_reader :types - end - - def initialize(name, type) - @name = name - @type = type - @url = File.join(INDEX, @type) - end - - attr_reader :name - attr_reader :type - attr_reader :url - attr_reader :html - - def download(name = nil) - if name - @html = Nokogiri::HTML(open(name) { |f| f.read }) - else - @html = Nokogiri::HTML(open(@url) { |f| f.read }) - end - end - - def save_html - File.open("#@name.html", "wb") { |w| w.write @html } - end - - def <=>(o) - self.name <=> o.name - end - - def parse - nodes = html.xpath("//table//table//tr") - - # How many children does the first node have? - node_count = nodes.first.children.select { |node| node.elem? }.size - - @mime_types = nodes.map do |node| - next if node == nodes.first - elems = node.children.select { |n| n.elem? } - next if elems.size.zero? - raise "size mismatch #{elems.size} != #{node_count}" if node_count != elems.size - - case elems.size - when 3 - subtype_index = 1 - refnode_index = 2 - when 4 - subtype_index = 1 - refnode_index = 3 - else - raise "Unknown element size." - end - - subtype = elems[subtype_index].content.chomp.strip - refnodes = elems[refnode_index].children.select { |n| n.elem? }.map { |ref| - case ref['href'] - when CONTACT_PEOPLE - tag = CGI::unescape($1).chomp.strip - if tag == ref.content - "[#{ref.content}]" - else - "[#{ref.content}=#{tag}]" - end - when RFC_EDITOR, IETF_RFC, IETF_RFC_TOOLS - "RFC#$1" - when %r{(https?://.*)} - "{#{ref.content}=#$1}" - else - ref - end - } - refs = refnodes.join(',') - - "#@type/#{subtype} 'IANA,#{refs}" - end.compact - - @mime_types - end - - def save_text - File.open("#@name.txt", "wb") { |w| w.write @mime_types.join("\n") } - end -end - -puts "Downloading index of MIME types from #{IANAParser::INDEX}." -IANAParser.load_index - -IANAParser.types.values.sort.each do |parser| - next if parser.name == "example" or parser.name == "mime" - puts "Downloading #{parser.name} from #{parser.url}" - parser.download - puts "Saving #{parser.name}.html" - parser.save_html - puts "Parsing #{parser.name}" - parser.parse - puts "Saving #{parser.name}.txt" - parser.save_text -end - -# foo = IANAParser.types['application'] -# foo.download("application.html") -# foo.parse -# foo = IANAParser.types['image'] -# foo.download("image.html") -# foo.parse From 54cd4193077d2decf875d1c0b7c12b15f270d6ed Mon Sep 17 00:00:00 2001 From: halostatue Date: Sat, 28 Feb 2009 01:55:53 -0500 Subject: [PATCH 3/3] Fixed an encoding issue for 1.9.1 --- lib/mime/types.rb.data | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/mime/types.rb.data b/lib/mime/types.rb.data index c25b239..0ca2cf5 100644 --- a/lib/mime/types.rb.data +++ b/lib/mime/types.rb.data @@ -1,4 +1,4 @@ -# vim: ft=ruby enc=utf-8 +# vim: ft=ruby encoding=utf-8 #-- # MIME::Types # A Ruby implementation of a MIME Types information library. Based in spirit