# http://chimera.labs.oreilly.com/books/1230000000393/ch01.html#_determining_the_most_frequently_occurring_items_in_a_sequence
# 1.12 determining the most frequently occurring items in a sequence
from collections import Counter


def _print_top_words(label, counts, limit=5):
    """Print the `limit` most common entries of `counts` under `label`.

    The ranked list is printed on one line after `label`, followed by one
    " word = count " line per entry.

    Args:
        label: Text printed in front of the ranked list.
        counts: A ``collections.Counter`` to summarise.
        limit: How many of the most common entries to show.
    """
    ranked = counts.most_common(limit)
    # Single-argument print() calls behave the same on Python 2 and 3.
    print("%s %s" % (label, ranked))
    # Iterate the ranked pairs directly: round-tripping through dict() only
    # made an extra copy and could lose the ranking order.
    for word, count in ranked:
        print(" %s = %d " % (word, count))


def test_Counter():
    """Demonstrate Counter: tally words, show the top five, add two tallies.

    Prints three reports separated by a dashed line: the five most common
    words of the first word list, of the second word list, and of the sum
    of both counters. Returns None.
    """
    article1 = [
        'As', 'a', 'news', 'aggregator', 'site', 'Google', 'uses', 'its',
        'own', 'software', 'to', 'determine', 'which', 'stories', 'to',
        'show', 'from', 'the', 'online', 'news', 'sources', 'it', 'watches',
        'Human', 'editorial', 'input', 'does', 'come', 'into', 'the',
        'system,', 'however,', 'in', 'choosing', 'exactly', 'which',
        'sources', 'Google', 'News', 'will', 'pick', 'from.', 'This', 'is',
        'where', 'some', 'of', 'the', 'controversy', 'over', 'Google',
        'News', 'originates,', 'when', 'some', 'news', 'sources', 'are',
        'included', 'when', 'visitors', 'feel', 'they', 'dont', 'deserve',
        'it', 'and', 'when', 'other', 'news', 'sources', 'are', 'excluded',
        'when', 'visitors', 'feel', 'they', 'ought', 'to', 'be', 'included',
        'For', 'examples,', 'see', 'the', 'above', 'mentions', 'of',
        'Indymedia,', 'or', 'National',
    ]
    article2 = [
        'from', 'somewhere', 'to', 'sometime', 'to', 'somebody',
    ]

    # Counter accepts any iterable of hashable items.
    word_counts_1 = Counter(article1)
    word_counts_2 = Counter(article2)

    # The labels keep their original spelling: they are program output.
    _print_top_words("artical1 top_five =", word_counts_1)
    print("----------")
    _print_top_words("artical2 top_five =", word_counts_2)
    print("----------")
    # Adding two Counters sums the counts of matching keys.
    _print_top_words("artical1 + artical2 =", word_counts_1 + word_counts_2)


if __name__ == "__main__":
    test_Counter()
# Sample output recorded from a Python 2 run (the order of tied entries and of
# the per-word lines can differ between interpreter versions):
#
# artical1 top_five = [('sources', 4), ('when', 4), ('news', 4), ('the', 4), ('to', 3)]
#  sources = 4
#  the = 4
#  when = 4
#  to = 3
#  news = 4
# ----------
# artical2 top_five = [('to', 2), ('somewhere', 1), ('sometime', 1), ('from', 1), ('somebody', 1)]
#  somewhere = 1
#  to = 2
#  sometime = 1
#  from = 1
#  somebody = 1
# ----------
# artical1 + artical2 = [('to', 5), ('sources', 4), ('when', 4), ('news', 4), ('the', 4)]
#  to = 5
#  news = 4
#  when = 4
#  the = 4
#  sources = 4
沒有留言:
張貼留言