Kata Zen: 006 Python Cookbook 01-12 Most Frequent Items

#!/usr/bin/env python

# http://chimera.labs.oreilly.com/books/1230000000393/ch01.html#_determining_the_most_frequently_occurring_items_in_a_sequence
# 1.12 determining the most frequently occurring items in a sequence

from collections import Counter

def _print_ranking(label, ranked):
    """Print ``label = ranked`` followed by one `` word = count `` line per pair.

    Args:
        label: Text printed before the ranked list.
        ranked: List of ``(word, count)`` pairs, as returned by
            ``Counter.most_common``.
    """
    print("%s = %s" % (label, ranked))
    # Walk the ranked pairs directly. Routing them through dict() first
    # builds a throwaway mapping and, on interpreters whose dicts are
    # unordered, prints the detail lines in a different order than the list.
    for word, count in ranked:
        print(" %s = %d " % (word, count))


def test_Counter():
    """Demonstrate ``collections.Counter`` on two hard-coded word lists.

    Prints the five most common words of each list, then the five most
    common words of the two counters added together.  Each section is
    separated by a line of dashes.  Returns ``None``; the only effect is
    text written to standard output.

    Every ``print`` call takes a single pre-formatted string so the code
    produces the same text under Python 2 and Python 3.
    """
    article1 = [
        'As', 'a', 'news', 'aggregator', 'site', 'Google', 'uses',
        'its', 'own', 'software', 'to', 'determine', 'which',
        'stories', 'to', 'show', 'from', 'the', 'online', 'news',
        'sources', 'it', 'watches', 'Human', 'editorial', 'input',
        'does', 'come', 'into', 'the', 'system,', 'however,', 'in',
        'choosing', 'exactly', 'which', 'sources', 'Google', 'News',
        'will', 'pick', 'from.', 'This', 'is', 'where', 'some', 'of',
        'the', 'controversy', 'over', 'Google', 'News', 'originates,',
        'when', 'some', 'news', 'sources', 'are', 'included', 'when',
        'visitors', 'feel', 'they', 'dont', 'deserve', 'it', 'and',
        'when', 'other', 'news', 'sources', 'are', 'excluded', 'when',
        'visitors', 'feel', 'they', 'ought', 'to', 'be', 'included',
        'For', 'examples,', 'see', 'the', 'above', 'mentions', 'of',
        'Indymedia,', 'or', 'National',
    ]
    article2 = [
        'from', 'somewhere', 'to', 'sometime', 'to', 'somebody',
    ]

    # Counter works on any sequence of hashable input items.
    word_counts_1 = Counter(article1)
    _print_ranking("artical1 top_five", word_counts_1.most_common(5))

    print("----------")
    word_counts_2 = Counter(article2)
    _print_ranking("artical2 top_five", word_counts_2.most_common(5))

    print("----------")
    # Adding two Counters sums the per-word counts of both inputs.
    sum_1_2 = word_counts_1 + word_counts_2
    _print_ranking("artical1 + artical2", sum_1_2.most_common(5))

if __name__ == "__main__":
    # Run the demonstration only when executed as a script, not on import.
    test_Counter()

artical1 top_five = [('sources', 4), ('when', 4), ('news', 4), ('the', 4), ('to', 3)]
sources = 4
the = 4
when = 4
to = 3
news = 4
----------
artical2 top_five = [('to', 2), ('somewhere', 1), ('sometime', 1), ('from', 1), ('somebody', 1)]
somewhere = 1
to = 2
sometime = 1
from = 1
somebody = 1
----------
artical1 + artical2 = [('to', 5), ('sources', 4), ('when', 4), ('news', 4), ('the', 4)]
to = 5
news = 4
when = 4
the = 4
sources = 4

Kata Zen

2014年4月8日星期二

006 Python Cookbook 01-12 Most Frequent Items

沒有留言:

張貼留言

2014年4月8日 星期二

006 Python Cookbook 01-12 Most Frequent Items

沒有留言:

張貼留言

2014年4月8日星期二