diff --git a/generate_test_data.py b/generate_test_data.py
new file mode 100644
index 0000000..b60c51b
--- /dev/null
+++ b/generate_test_data.py
@@ -0,0 +1,6 @@
+
+import random
+
+for i in range(1000000):
+    print( str(random.random())+","+str(int(random.expovariate(1/5))) )
+
diff --git a/phisto.py b/phisto.py
new file mode 100644
index 0000000..25d8b85
--- /dev/null
+++ b/phisto.py
@@ -0,0 +1,110 @@
+import sys
+from multiprocessing import Pool
+
+# Compute the histogram of a csv file with a python parallel program
+# Input file format:
+# name,count\n
+# Output the frequency of "count".
+
+# Stolen from http://mikecvet.wordpress.com/2010/07/02/parallel-mapreduce-in-python/
+# Adapted to histogram computation instead of word count
+# And ported to python3
+
+"""
+Given a list of star counts, return a list of
+(count, 1) tuples, so that the later Partition/Reduce
+stages can sum the ones to obtain the frequency of
+each distinct count value.
+"""
+def Map(L):
+    results = []
+    for w in L:
+        results.append ((w, 1))
+    return results
+
+
+"""
+Group the sublists of (token, 1) pairs into a term-frequency-list
+map, so that the Reduce operation later can work on sorted
+term counts. The returned result is a dictionary with the structure
+{token : [(token, 1), ...] .. }
+"""
+def Partition(L):
+    tf = {}
+    for sublist in L:
+        for p in sublist:
+            # Append the tuple to the list in the map
+            try:
+                tf[p[0]].append (p)
+            except KeyError:
+                tf[p[0]] = [p]
+    return tf
+
+
+"""
+Given a (token, [(token, 1) ...]) tuple, collapse all the
+count tuples from the Map operation into a single term frequency
+number for this token, and return a final tuple (token, frequency).
+"""
+def Reduce(Mapping):
+    return (Mapping[0], sum(pair[1] for pair in Mapping[1]))
+
+
+
+"""
+Load the contents of the file at the given
+path into a big list of ints and return it.
+"""
+def load(path):
+    stars = []
+    with open(path, "r") as f:
+        for line in f:
+            stars.append(int(line.split(",")[1]))
+
+    # The input is CSV ("name,count"), so split on the comma
+    # and keep the integer "count" field of each line.
+    return stars
+
+
+"""
+A generator function for chopping up a given list into chunks of
+length n.
+"""
+def chunks(l, n):
+    for i in range(0, len(l), n):
+        yield l[i:i+n]
+
+
+if __name__ == '__main__':
+
+    if (len(sys.argv) != 3):
+        print("Usage: phisto file nprocs")
+        sys.exit(1)
+
+    nprocs=int(sys.argv[2])
+
+    print("Load the input file into a list of counts",file=sys.stderr)
+    stars = load (sys.argv[1])
+
+    print("Build a pool of %d processes" % nprocs,file=sys.stderr)
+    pool = Pool(processes=nprocs)
+
+    print("Fragment the data into %d chunks" % nprocs,file=sys.stderr)
+    partitioned_stars = list(chunks(stars, len(stars) // nprocs))
+
+    print("Generate (count, 1) tuples",file=sys.stderr)
+    single_count_tuples = pool.map(Map, partitioned_stars)
+
+    print("Organize the count tuples; lists of tuples by token key",file=sys.stderr)
+    token_to_tuples = Partition(single_count_tuples)
+
+    print("Collapse the lists of tuples into total term frequencies",file=sys.stderr)
+    term_frequencies = pool.map(Reduce, token_to_tuples.items())
+
+    print("Sort the term frequencies in increasing order",file=sys.stderr)
+    # term_frequencies.sort(key=lambda x: x[1]) # nb of projects
+    term_frequencies.sort(key=lambda x: x[0]) # nb of stars
+
+    for pair in term_frequencies[:20]:
+        print( "%i occurs %i times" % pair )
+