from os import listdir
# Input files: every entry of the current directory whose name ends in ".csv".
# (A plain substring test would also pick up names such as "x.csv.bak".)
# Sorted so the processing order does not depend on the directory listing order.
files = sorted(a for a in listdir(".") if a.endswith(".csv"))
# Only rows whose id number is at least m are processed; rows with id < m are skipped.
m = 0
# Alternative thresholds left in place by the original author:
#m = 78
#m = 78 + 25
# Output file, opened for writing here; it is filled while the CSV files are
# read and closed further down in the script.
out = open("some20.txt", "w")

# Id numbers to keep; rows carrying any other id are skipped.
some = [143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 98, 99, 100, 101, 102, 73, 74, 75, 76, 77]

def overlap(a, b):
	"""Pick one item from the comma-separated string *a*, preferring items also in *b*.

	Both arguments are strings of comma-separated items.

	* If *a* holds only one distinct item, *a* itself is returned unchanged
	  (so "x,x" comes back as "x,x").
	* Otherwise the first item of *a* that also occurs in *b* is returned.
	* If the two share no item, the first item of *a* is returned.

	The choice follows the order of appearance in *a*, so the result is
	reproducible from run to run rather than depending on set iteration order.
	"""
	candidates = a.split(",")
	if len(set(candidates)) == 1:
		return a
	others = set(b.split(","))
	for item in candidates:
		if item in others:
			return item
	# Nothing in common: fall back to the first item of a.
	return candidates[0]

from nltk import FreqDist
# One frequency table per id number; ids index straight into this list,
# so every processed id must be below 200.
table = [FreqDist() for _ in range(200)]
# Frequency of each value over all processed rows, regardless of id.
hist = FreqDist()
try:
	for f in files:
		# Read the whole file inside "with" so the handle is closed right away.
		with open(f) as source:
			# The first line is dropped (presumably a header row).
			rows = source.read().splitlines()[1:]
		stem = f.split(".")[0]
		for row in rows:
			# Whitespace-only lines carry no data and would make int() fail.
			if not row.strip():
				continue
			fields = row.split(";")
			n = int(fields[0])
			if n < m or n not in some:
				continue
			value = fields[1].strip()
			out.write("%s	%d	%s\n" % (stem, n, value))
			table[n][value] += 1
			hist[value] += 1
finally:
	# Close the output file even when a malformed row aborts the loop.
	out.close()

# Report the per-id counts, skipping ids that never occurred.
# Each print call takes a single parenthesised argument, which prints the same
# text under Python 2 and is also valid Python 3 syntax.
for n, counts in enumerate(table):
	pairs = counts.items()
	if pairs:
		print("%d %s" % (n, " ".join("%s:%d" % (b, c) for b, c in pairs)))

print("")
print(hist)
