#!/usr/bin/env python
# sweet jeebus, this code is fugly.
from BeautifulSoup import UnicodeDammit # http://www.crummy.com/software/BeautifulSoup/
import codecs, sys
from glob import glob
# Wrap stdout in a UTF-8 encoding writer so the unicode strings printed by
# this script are encoded as UTF-8 on output (for UTF-8 enabled consoles, YMMV).
sys.stdout = codecs.getwriter('utf-8')(sys.stdout)
# Directory holding the UDHR plain-text files. Create this directory and
# populate it with the contents of
# http://udhrinunicode.org/assemblies/udhr_txt.zip
UDHR_REPO = "udhr/"
# Text printed before the table: the title and the caveats about word counts.
pre = """
Languages by Average word length
Languages by Average word length
This table shows a listing of languages by average word length, as calculated from the texts at the UDHR in Unicode.
Caveats:
- My definition of "word" consists of splitting on space. (Hence screwed up counts for Amharic, Thai, etc, which don't use spaces.)
- I believe there are some incomplete texts in the UDHR collection I used, not sure.
"""
# Text printed after the table (currently only a newline).
post = """
"""
def avg(seq):
    """Return the arithmetic mean of the numbers in *seq* as a float.

    *seq* must support both iteration and ``len()`` (e.g. a list or tuple).
    An empty sequence yields ``0.0`` instead of raising ZeroDivisionError,
    so a text that contains no words does not abort the whole run.
    """
    count = len(seq)
    if count == 0:
        return 0.0
    # float() forces true division under Python 2, where int / int truncates.
    return float(sum(seq)) / count
def readudhr():
    """Load every UDHR translation found under UDHR_REPO.

    Returns:
        A dict mapping each language code (the part of the file name between
        ``udhr_`` and ``.txt``) to a tuple
        ``(average_length, name, code, content)``. *average_length* is the
        mean length of the whitespace-separated words, *name* is the second
        `` - ``-separated field of the file's first line (falling back to
        the code when that field is missing), and *content* is the decoded
        text of the file.

    Files that yield no text (nothing decoded, or only whitespace) are
    skipped rather than aborting the run.
    """
    import os  # local import: only this function needs path handling

    prefix = 'udhr_'
    udhr = {}
    for fname in glob(os.path.join(UDHR_REPO, prefix + '*.txt')):
        # The with-block closes the handle even if reading fails.
        with open(fname) as handle:
            raw = handle.read()
        # UnicodeDammit guesses the encoding and exposes the decoded text.
        content = UnicodeDammit(raw).unicode
        if not content or not content.strip():
            continue
        # Derive the code from the file name itself rather than from a
        # hard-coded 'udhr/udhr_' path, so it follows UDHR_REPO.
        base = os.path.splitext(os.path.basename(fname))[0]
        code = base[len(prefix):]
        header_fields = content.splitlines()[0].split(' - ')
        name = header_fields[1] if len(header_fields) > 1 else code
        # content is known to be non-blank here, so the word list is non-empty.
        word_lengths = [len(word) for word in content.split()]
        average_length = avg(word_lengths)
        udhr[code] = (average_length, name, code, content)
    return udhr
# Emit the report: preamble, header row, one row per language in ascending
# order of average word length, then the footer.
# Each print takes a single parenthesized argument, which behaves exactly like
# the print statement under Python 2.
udhr = readudhr()
rank = 0  # stays 0 when no texts were found
print(pre)
print("")
print("| Rank | Length | Language | code | sample |\n")
# The tuples sort on their first element, average_length; ties fall through
# to name, code and content. Numbering starts at 1: the counter used to be
# incremented before the first row was printed, so the table began at rank 2.
for rank, entry in enumerate(sorted(udhr.values()), 1):
    average_length, name, code, content = entry
    # Sample: a fixed 200-character window with newlines removed so that the
    # row stays on a single line.
    sample = content[400:600].replace('\n', '')
    print("| %d | %.2f | %s | %s | %s |\n" % (rank, average_length, name, code, sample))
print("\n")
print(post)