2011-10-22 08:27:05 +04:00
|
|
|
# -*- ruby encoding: utf-8 -*-
|
2006-02-13 00:27:22 +03:00
|
|
|
|
2009-02-24 08:04:55 +03:00
|
|
|
require 'rubygems'
|
|
|
|
require 'hoe'
|
2005-07-08 15:59:37 +04:00
|
|
|
|
2011-10-22 08:27:05 +04:00
|
|
|
# Activate each Hoe plugin this Rakefile uses, one call per plugin and in
# the same order as before.
[:doofus, :gemspec, :rubyforge, :git, :minitest].each do |plugin_name|
  Hoe.plugin plugin_name
end
|
2006-02-13 00:27:22 +03:00
|
|
|
|
2011-10-22 08:27:05 +04:00
|
|
|
# Hoe specification for the mime-types gem. The result is kept in +spec+.
spec = Hoe.spec('mime-types') do
  # The RubyForge project name is the same as the gem name.
  self.rubyforge_name = name

  developer 'Austin Ziegler', 'austin@rubyforge.org'

  # Project home page and documentation publishing settings.
  self.url = 'http://mime-types.rubyforge.org/'
  self.remote_rdoc_dir = '.'
  rsync_args << ' --exclude=statsvn/'

  # Documentation inputs: the history, the readme, and every top-level
  # .rdoc file.
  self.history_file = 'History.rdoc'
  self.readme_file = 'README.rdoc'
  self.extra_rdoc_files = FileList['*.rdoc'].to_a

  # Development-only dependencies, appended in this order.
  [
    ['nokogiri',      '~> 1.5'],
    ['minitest',      '~> 2.0'],
    ['hoe-doofus',    '~> 1.0'],
    ['hoe-gemspec',   '~> 1.0'],
    ['hoe-git',       '~> 1.0'],
    ['hoe-seattlerb', '~> 1.0']
  ].each do |dependency|
    extra_dev_deps << dependency
  end
end
|
|
|
|
|
2011-10-22 08:27:05 +04:00
|
|
|
namespace :mime do
|
|
|
|
desc "Download the current MIME type registrations from IANA."
|
|
|
|
task :iana, :save, :destination do |t, args|
|
2011-10-24 07:19:35 +04:00
|
|
|
# Which outputs to keep: the :save task argument as a symbol, defaulting
# to :text when the argument is not given.
save_type = (args.save || :text).to_sym

# Only the three known modes are accepted; anything else aborts the task.
unless [:text, :both, :html].include?(save_type)
  raise "Unknown save type provided. Must be one of text, both, or html."
end

# Directory that receives the generated files; the :destination task
# argument, or "type-lists" when it is not given.
destination = args.destination || "type-lists"
|
2009-02-24 08:04:55 +03:00
|
|
|
|
2011-10-22 08:27:05 +04:00
|
|
|
require 'open-uri'
|
|
|
|
require 'nokogiri'
|
|
|
|
require 'cgi'
|
2006-02-13 00:46:50 +03:00
|
|
|
|
2011-10-22 08:27:05 +04:00
|
|
|
# Downloads one IANA media-type registry page and converts its table rows
# into text lines of the form "type/subtype 'IANA,references".
#
# IANAParser.load_index fills IANAParser.types with one parser per registry
# linked from INDEX, keyed by the link text. Instances sort by #name
# (Comparable).
class IANAParser
  include Comparable

  # Page that links to the individual media-type registries.
  INDEX = %q(http://www.iana.org/assignments/media-types/)

  # Patterns for the reference links found in a registry row. Capture 1 is
  # the contact anchor (CONTACT_PEOPLE) or the RFC number (the others).
  # Dots are escaped so that they only match a literal ".".
  CONTACT_PEOPLE = %r{http://www\.iana\.org/assignments/contact-people\.html?#(.*)}
  RFC_EDITOR = %r{http://www\.rfc-editor\.org/rfc/rfc(\d+)\.txt}
  IETF_RFC = %r{http://www\.ietf\.org/rfc/rfc(\d+)\.txt}
  IETF_RFC_TOOLS = %r{http://tools\.ietf\.org/html/rfc(\d+)}

  class << self
    # Reads INDEX and registers an IANAParser in +types+ for every
    # "//p/a" link whose href looks like /assignments/media-types/<type>/.
    # Links with any other href (or none) are ignored.
    def load_index
      @types ||= {}

      Nokogiri::HTML(read_remote(INDEX)).xpath('//p/a').each do |tag|
        href_match = %r{^/assignments/media-types/(.+)/$}.match(tag['href'])
        next if href_match.nil?

        @types[tag.content] = new(tag.content, href_match.captures[0])
      end
    end

    # Returns the full contents of +location+ (a URL or a file name) as a
    # String. Requires open-uri to be loaded.
    #
    # URI.open is used when open-uri provides it, because the Kernel#open
    # patch stopped opening URLs in Ruby 3.0; older open-uri versions only
    # offer the patched #open, which is the fallback.
    def read_remote(location)
      if URI.respond_to?(:open)
        URI.open(location) { |io| io.read }
      else
        open(location) { |io| io.read }
      end
    end

    # Hash of registry link text => IANAParser; nil until load_index runs.
    attr_reader :types
  end

  # name:: label for the registry; also the base name of the saved files.
  # type:: media type (the path segment under INDEX); used to build #url
  #        and as the prefix of every parsed entry.
  def initialize(name, type)
    @name = name
    @type = type
    @url = File.join(INDEX, @type)
  end

  attr_reader :name
  attr_reader :type
  attr_reader :url
  # The parsed document from the last #download; nil before that.
  attr_reader :html

  # Fetches and parses +name+ (or #url when +name+ is nil) into #html.
  def download(name = nil)
    @html = Nokogiri::HTML(self.class.read_remote(name || @url))
  end

  # Writes the downloaded document to "<name>.html" in the current
  # directory.
  def save_html
    File.open("#@name.html", "wb") { |w| w.write @html }
  end

  # Parsers are ordered by name.
  def <=>(o)
    self.name <=> o.name
  end

  # Converts the rows matched by "//table//table//tr" in #html into an
  # Array of "type/subtype 'IANA,refs" strings, stores it for #save_text,
  # and returns it.
  #
  # The first row is treated as a header and never produces an entry; rows
  # without element children are skipped. A page with no rows, or with
  # nothing but a header, yields an empty Array.
  #
  # Raises RuntimeError when a row's cell count differs from the expected
  # count, or when that count is neither 3 nor 4.
  def parse
    header, *body = html.xpath("//table//table//tr").to_a
    return @mime_types = [] if header.nil?

    cells_of = lambda { |row| row.children.select { |n| n.elem? } }
    data_rows = body.map { |row| cells_of.call(row) }.reject { |cells| cells.empty? }

    # How many cells should every data row have? Normally the header tells
    # us, but a single-cell header is just a title, so fall back to the
    # first real data row.
    node_count = cells_of.call(header).size
    node_count = data_rows.first.size if node_count == 1 && !data_rows.empty?

    @mime_types = data_rows.map do |elems|
      raise "size mismatch #{elems.size} != #{node_count}" if node_count != elems.size

      # The subtype is always the second cell; the references are the last.
      subtype_index, refnode_index =
        case elems.size
        when 3 then [1, 2]
        when 4 then [1, 3]
        else raise "Unknown element size."
        end

      subtype = elems[subtype_index].content.chomp.strip
      refs = elems[refnode_index].children.select { |n| n.elem? }.map { |ref|
        format_reference(ref)
      }.join(',')

      # The leading single quote before IANA is part of the emitted text.
      "#@type/#{subtype} 'IANA,#{refs}"
    end
  end

  # Writes the entries produced by #parse to "<name>.txt" in the current
  # directory, one per line.
  def save_text
    File.open("#@name.txt", "wb") { |w| w.write @mime_types.join("\n") }
  end

  # Renders one reference element according to its href:
  # contact-people link:: "[text]", or "[text=anchor]" when the unescaped
  #                       anchor differs from the link text
  # RFC link::            "RFC<number>"
  # any other http(s)::   "{text=url}"
  # anything else::       the element itself, unchanged
  def format_reference(ref)
    case ref['href']
    when CONTACT_PEOPLE
      tag = CGI::unescape($1).chomp.strip
      tag == ref.content ? "[#{ref.content}]" : "[#{ref.content}=#{tag}]"
    when RFC_EDITOR, IETF_RFC, IETF_RFC_TOOLS
      "RFC#$1"
    when %r{(https?://.*)}
      "{#{ref.content}=#$1}"
    else
      ref
    end
  end
  private :format_reference
end
|
2009-02-28 07:35:41 +03:00
|
|
|
|
2011-10-22 08:27:05 +04:00
|
|
|
# Load the registry index, then download, parse and save every registry
# inside the destination directory.
puts "Downloading index of MIME types from #{IANAParser::INDEX}."
IANAParser.load_index

require 'fileutils'
FileUtils.mkdir_p(destination)

# Decide once which outputs the chosen save mode asks for.
keep_html = [:html, :both].include?(save_type)
keep_text = [:text, :both].include?(save_type)

Dir.chdir(destination) do
  IANAParser.types.values.sort.each do |registry|
    # The "example" and "mime" registries are not processed.
    next if %w(example mime).include?(registry.name)

    puts "Downloading #{registry.name} from #{registry.url}"
    registry.download

    if keep_html
      puts "Saving #{registry.name}.html"
      registry.save_html
    end

    puts "Parsing #{registry.name} HTML"
    registry.parse

    if keep_text
      puts "Saving #{registry.name}.txt"
      registry.save_text
    end
  end
end
|
|
|
|
end
|
2009-02-28 18:50:52 +03:00
|
|
|
|
2011-10-22 08:27:05 +04:00
|
|
|
desc "Shows known MIME type sources."
task :mime_type_sources do
  # One URL per line of output, in this order.
  known_sources = %w(
    http://www.ltsw.se/knbase/internet/mime.htp
    http://www.webmaster-toolkit.com/mime-types.shtml
    http://plugindoc.mozdev.org/winmime.php
    http://standards.freedesktop.org/shared-mime-info-spec/shared-mime-info-spec-latest.html
    http://www.feedforall.com/mime-types.htm
    http://www.iana.org/assignments/media-types/
  )
  puts known_sources
end
|
|
|
|
end
|
2011-10-22 08:27:05 +04:00
|
|
|
|
|
|
|
# vim: syntax=ruby
|