#!/usr/bin/env python
"""Rank the UDHR translations by average word length and print an HTML page.

Every ``udhr_<code>.txt`` file found under ``UDHR_REPO`` is decoded, split on
whitespace, and reduced to the mean length of its tokens.  The result is
written to stdout as an HTML table with one row per language, sorted by
ascending average word length.

To get the input data, create the directory named by ``UDHR_REPO`` and
populate it with the contents of
http://udhrinunicode.org/assemblies/udhr_txt.zip
"""

import codecs
import os
import sys
from glob import glob
from xml.sax.saxutils import escape

from BeautifulSoup import UnicodeDammit  # http://www.crummy.com/software/BeautifulSoup/

# For printing to UTF-8 enabled consoles, YMMV.  When the stream exposes an
# underlying byte buffer, wrap that instead of the text layer.
sys.stdout = codecs.getwriter('utf-8')(getattr(sys.stdout, 'buffer', sys.stdout))

UDHR_REPO = "udhr/"
_FILE_PREFIX = "udhr_"
_FILE_SUFFIX = ".txt"

# NOTE(review): in the copy of this file under review the markup inside the
# string literals below had been stripped (only the text content survived).
# The tags are a reconstruction; the visible text is unchanged.  Confirm the
# markup against the published page.
pre = """<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<title>Languages by Average word length</title>
</head>
<body>

<h1>Languages by Average word length</h1>

<p>This table shows a listing of languages by average word length, as calculated from the texts at the <a href="http://udhrinunicode.org/">UDHR in Unicode</a>.</p>

<p>Caveats:</p>

<ol>
  <li>My definition of "word" consists of splitting on space. (Hence screwed up counts for Amharic, Thai, etc, which don't use spaces.)</li>
  <li>I believe there are some incomplete texts in the UDHR collection I used, not sure.</li>
</ol>
"""

post = """</body>
</html>
"""

_HEADER_ROW = ("<tr><th>Rank</th><th>Length</th><th>Language</th>"
               "<th>code</th><th>sample</th></tr>")
_DATA_ROW = ("<tr><td>%d</td><td>%.2f</td><td>%s</td>"
             "<td>%s</td><td>%s</td></tr>")


def avg(seq):
    """Return the arithmetic mean of ``seq`` as a float.

    Raises ZeroDivisionError when ``seq`` is empty.
    """
    return float(sum(seq)) / len(seq)


def readudhr():
    """Load every UDHR text under ``UDHR_REPO``.

    Returns a dict mapping the language code (the part of the file name
    between ``udhr_`` and ``.txt``) to a tuple
    ``(average_length, name, code, content)``, where ``name`` is the second
    `` - ``-separated field of the file's first line and ``content`` is the
    decoded text.

    Files that yield no words are reported on stderr and skipped instead of
    aborting the whole run.
    """
    udhr = {}
    pattern = os.path.join(UDHR_REPO, _FILE_PREFIX + '*' + _FILE_SUFFIX)
    for fname in glob(pattern):
        # Read raw bytes so that UnicodeDammit sees the file untouched, and
        # close the handle deterministically.
        with open(fname, 'rb') as handle:
            raw = handle.read()

        # Derive the code from the base name so it does not depend on the
        # exact spelling of UDHR_REPO or on the platform's path separator.
        code = os.path.basename(fname)[len(_FILE_PREFIX):-len(_FILE_SUFFIX)]

        # UnicodeDammit may not be able to decode the bytes; treat a missing
        # or empty result like an empty file.
        content = UnicodeDammit(raw).unicode
        words = content.split() if content else []
        if not words:
            sys.stderr.write("skipping %s: no decodable text\n" % fname)
            continue

        # The first line is expected to look like "<title> - <language>";
        # fall back to the code when that separator is absent.
        header_fields = content.splitlines()[0].split(' - ')
        name = header_fields[1] if len(header_fields) > 1 else code

        average_length = avg([len(word) for word in words])
        udhr[code] = (average_length, name, code, content)
    return udhr


udhr = readudhr()

# Single-argument print(...) is valid as both a Python 2 statement and a
# Python 3 call.
print(pre)
print("<table>")
print(_HEADER_ROW)
# Tuples sort on average_length first; ranks start at 1 for the first row.
for rank, (average_length, name, code, content) in enumerate(sorted(udhr.values()), 1):
    # A fixed 200-character window from the body serves as the sample; line
    # breaks are removed so it stays on one table row.
    sample = content[400:600].replace('\r', '').replace('\n', '')
    # Escape file-derived text so stray '&', '<' or '>' cannot break the page.
    print(_DATA_ROW % (rank, average_length, escape(name), escape(code), escape(sample)))
print("</table>")
print(post)