# -*- coding: utf-8 -*- #!/usr/bin/env python from glob import glob from textual import uread, depunctuate import sys files = sys.argv[1:] def words(text): text = depunctuate(text) text = text.lower() thewords = text.split() return thewords def affixiness(text): wordlist = words(text) numwords = len(wordlist) numforms = len(set(wordlist)) return float(numforms) / float(numwords) scores = [] for f in files: text = uread(f) scores.append((affixiness(text), f)) scores.sort() from ethnologue import code2lang for score, fname in scores: fname = unicode(fname) try: language = code2lang[fname.replace('udhr/udhr_','').replace('.txt','')] except KeyError: language = fname print score, '\t', language