Skip to content

Instantly share code, notes, and snippets.

@nebuta
Created November 8, 2011 09:40
Show Gist options
  • Save nebuta/1347366 to your computer and use it in GitHub Desktop.
Save nebuta/1347366 to your computer and use it in GitHub Desktop.
Test algorithm with smaller dictionaries
require 'rubygems'
require 'hpricot'
# Byte values treated as ASCII by isAscii?: the range 0x20..0x7e followed by
# tab (0x09), line feed (0x0a), form feed (0x0c) and carriage return (0x0d).
$asciilist = (0x20..0x7e).to_a + [0x09, 0x0a, 0x0c, 0x0d]
# Flattens a tab-separated score table into a single 65536-element Array.
#
# lines - Enumerable of Strings. Line r (0-based) supplies the values stored
#         at indices r*256 .. r*256+255; every field is converted with
#         String#to_f.
#
# The layout is always exactly 256 rows x 256 columns, because callers index
# the result as first_byte*256 + second_byte:
# * a row with fewer than 256 fields is padded with 0.0 (the same value an
#   empty field converts to),
# * fields beyond the 256th of a row are ignored,
# * lines beyond the 256th are ignored,
# * rows that were never supplied stay nil.
#
# Returns the Array (length 65536).
def parse(lines)
  arr = Array.new(65536)
  start = 0
  lines.each { |line|
    break if start >= 65536
    # A limit of -1 keeps trailing empty fields, so "1\t\t" still yields three
    # columns instead of silently shortening the row.
    row = line.chomp.split("\t", -1).first(256).map { |e| e.to_f }
    # Slice assignment resizes the array unless the replacement has exactly
    # 256 elements, so normalise the row width first.
    row.concat(Array.new(256 - row.length, 0.0))
    arr[start, 256] = row
    start += 256
  }
  arr
end
# Reports whether the value +b+ is one of the entries of the global
# $asciilist table. Returns true or false.
def isAscii?(b)
  position = $asciilist.index(b)
  !position.nil?
end
# Writes one line of tab-separated scores to +out+, formatted with one
# decimal place and ordered by encoding name, then returns the name of the
# highest-scoring encoding as a String.
def report_scores(out, score)
  ranked = score.to_a.sort { |a, b| a[1] <=> b[1] }.reverse
  out.puts score.to_a.sort { |a, b| a[0].to_s <=> b[0].to_s }.map { |a| "%.1f" % a[1] }.join("\t")
  ranked[0][0].to_s
end

# Scores the non-ASCII byte pairs of +bytes+ against per-encoding tables and
# reports which encoding accumulated the highest total.
#
# out    - object responding to puts; receives the score line.
# vector - Hash with the keys :utf8, :eucjp, :iso and :shiftjis, each mapping
#          to an Array indexed by previous_byte*256 + current_byte.
# bytes  - Array of byte values (0..255).
# limit  - number of scored byte pairs after which scanning stops. Defaults to
#          100, the value that used to be hard-coded. The check runs after a
#          pair has been scored, so values below 1 behave like 1.
#
# A pair is scored only when neither of its two bytes satisfies isAscii?.
#
# Returns [encoding_name, scored_pair_count, index_reached].
def test_judge(out, vector, bytes, limit = 100)
  len = bytes.length
  score = { :utf8 => 0, :eucjp => 0, :iso => 0, :shiftjis => 0 }
  i = 0
  ch_count = 0
  while i < len
    i += 1
    # Look for the start of a run: a byte that is not ASCII.
    next if isAscii?(bytes[i - 1])
    # Walk the run, scoring each (previous, current) pair until an ASCII byte
    # or the end of the input ends it.
    while i < len
      break if isAscii?(bytes[i])
      pair = bytes[i - 1] * 256 + bytes[i]
      score.each_key { |key| score[key] += vector[key][pair] }
      ch_count += 1
      i += 1
      return [report_scores(out, score), ch_count, i] if ch_count >= limit
    end
  end
  [report_scores(out, score), ch_count, i]
end
# Compares a detected encoding name with the charset declared by a document.
#
# res    - detected encoding name ("shiftjis", "eucjp", "iso" or "utf8").
# answer - declared charset; matched case-insensitively against the patterns
#          below, in order, and the first match wins.
#
# Returns "OK" when +res+ equals the name belonging to the matched pattern,
# "NG" when it does not, and "N/A" when no pattern matches +answer+.
def judge(res, answer)
  known = [
    [/shift[_-]jis/i, "shiftjis"],
    [/euc[_-]jp/i, "eucjp"],
    [/ISO-2022-JP/i, "iso"],
    [/UTF-8/i, "utf8"]
  ]
  hit = known.find { |pattern, _name| pattern === answer }
  return "N/A" unless hit

  verdicts = { true => "OK", false => "NG" }
  verdicts.fetch(res == hit[1], "N/A")
end
# Runs the detection experiment over every web*.html file in the "web"
# directory.
#
# For each coarseness level c in 1, 2, 4, ..., 128:
# * result_c<c>.txt is opened for writing in the starting directory,
# * the four tables sub_<encoding>_c<c>.txt are loaded through parse.
#
# The working directory is then changed to "web" (and left there). Each HTML
# file is parsed with Hpricot, its script elements are removed, and the body
# text is written to "<file>.txt". When the head contains a charset
# declaration, the body bytes are passed to test_judge once per coarseness
# level and the verdict is appended to the matching result file; otherwise
# the file is skipped. An error raised while handling one file is printed and
# does not stop the run.
#
# Every file opened here is closed again, also when an error occurs.
def main
  coarse = [1, 2, 4, 8, 16, 32, 64, 128]
  encode = [:utf8, :eucjp, :iso, :shiftjis]
  outs = Hash.new
  coarse.each { |c|
    outs[c] = File.open("result_c#{c}.txt", "w")
  }
  vs = Hash.new
  coarse.each { |c|
    v = Hash.new
    encode.each { |k|
      filename = "sub_#{k.to_s}_c#{c}.txt"
      puts "Parsing: #{filename}"
      v[k] = parse(IO.readlines(filename))
    }
    vs[c] = v
  }
  Dir::chdir("web")
  Dir::glob("web*.html").each { |file|
    html = nil
    begin
      html = File.open(file)
      doc = Hpricot(html)
      (doc/'script').remove
      text = (doc/:body).inner_text
      (doc/:head).inner_html =~ /charset=['"]?(.+?)['"]/
      # Capture the match right away, before any later call can replace it.
      answer = $1
      $stderr.puts File.basename(file)
      if answer
        File.open(file + ".txt", "w") { |out|
          out.print text
        }
        # The byte array is the same for every coarseness level; decode once.
        bytes = text.unpack('C*')
        coarse.each { |c|
          res, ch, index = test_judge(outs[c], vs[c], bytes)
          outs[c].puts File.basename(file)
          outs[c].puts [res, answer, judge(res, answer), ch, index].join("\t")
          outs[c].puts
        }
      else
        puts "skipped (no encoding info in HTML)"
      end
    rescue => e
      puts "Error: " + File.basename(file)
      p e
    ensure
      # Release the input handle whether or not processing succeeded.
      html.close if html && !html.closed?
    end
  }
ensure
  # Close the result files even when loading a table or globbing raised.
  outs.each_value { |f| f.close unless f.closed? } if outs
end
main
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment