2014年4月8日 星期二

006 Python Cookbook 01-12 Most Frequently Occurring Items

#!/usr/bin/env python

# http://chimera.labs.oreilly.com/books/1230000000393/ch01.html#_determining_the_most_frequently_occurring_items_in_a_sequence
# 1.12 determining the most frequently occurring items in a sequence


from collections import Counter

def test_Counter():
  artical1 = [
    'As', 'a', 'news', 'aggregator', 'site', 'Google', 'uses',
    'its', 'own', 'software', 'to', 'determine', 'which',
    'stories', 'to', 'show', 'from', 'the', 'online', 'news',
    'sources', 'it', 'watches', 'Human', 'editorial', 'input',
    'does', 'come', 'into', 'the', 'system,', 'however,', 'in',
    'choosing', 'exactly', 'which', 'sources', 'Google', 'News',
    'will', 'pick', 'from.', 'This', 'is', 'where', 'some', 'of',
    'the', 'controversy', 'over', 'Google', 'News', 'originates,',
    'when', 'some', 'news', 'sources', 'are', 'included', 'when',
    'visitors', 'feel', 'they', 'dont', 'deserve', 'it', 'and',
    'when', 'other', 'news', 'sources', 'are', 'excluded', 'when',
    'visitors', 'feel', 'they', 'ought', 'to', 'be', 'included',
    'For', 'examples,', 'see', 'the', 'above', 'mentions', 'of',
    'Indymedia,', 'or', 'National',
  ]
  artical2 = [
    'from', 'somewhere', 'to', 'sometime', 'to', 'somebody'
  ]
  
  # Counter works on all sequence of hashable inout items
  word_counts_1 = Counter(artical1)
  top_five = word_counts_1.most_common(5)
  print "artical1 top_five =", top_five
  for (key, value) in dict(top_five).items():
    print " %s = %d " % (key, value)
    
  print "----------"
  word_counts_2 = Counter(artical2)
  top_five = word_counts_2.most_common(5)
  print "artical2 top_five =", top_five
  for (key, value) in dict(top_five).items():
    print " %s = %d " % (key, value)
    
  print "----------"
  sum_1_2 = word_counts_1 + word_counts_2
  top_five = sum_1_2.most_common(5)
  print "artical1 + artical2 =", top_five
  
  #for (key, value) in dict(top_five).items():
  # refined for better performance (by reducing memory copy)
  for (key, value) in dict(top_five).iteritems():
    print " %s = %d " % (key, value)  

# Run the demonstration only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
  test_Counter()

artical1 top_five = [('sources', 4), ('when', 4), ('news', 4), ('the', 4), ('to', 3)]
 sources = 4 
 the = 4 
 when = 4 
 to = 3 
 news = 4 
----------
artical2 top_five = [('to', 2), ('somewhere', 1), ('sometime', 1), ('from', 1), ('somebody', 1)]
 somewhere = 1 
 to = 2 
 sometime = 1 
 from = 1 
 somebody = 1 
----------
artical1 + artical2 = [('to', 5), ('sources', 4), ('when', 4), ('news', 4), ('the', 4)]
 to = 5 
 news = 4 
 when = 4 
 the = 4 
 sources = 4 

沒有留言:

張貼留言