import nltk 
def getData(src):
    '''Read a list of words from the given source file (src).

    The file contents are split on any run of whitespace, so a file with
    one word per line yields one entry per line; blank lines produce no
    entries, and a line holding several space-separated words yields
    several entries.

    src -- path of the text file to read.

    Returns a list of strings in file order (empty for an empty file).
    Errors raised by open() for a missing or unreadable file propagate.
    '''
    # The context manager closes the handle even if read() raises.
    with open(src) as handle:
        return handle.read().split()
def max_match(word, word_list):
    '''Segment word greedily, always taking the longest known prefix.

    Scanning starts at the left edge of word.  At each position the
    longest slice beginning there that occurs in word_list becomes the
    next token.  When no slice matches, the single character at that
    position is emitted as a token of its own and scanning resumes one
    character further on.

    word      -- the string to segment.
    word_list -- collection of known words; membership is tested with
                 the `in` operator.

    Returns the tokens in order.  Every token is non-empty and the
    tokens concatenate back to word.
    '''
    # Membership on a list/tuple is a linear scan and is repeated for
    # every candidate slice; hash the entries once so each test is O(1).
    if isinstance(word_list, (list, tuple)):
        try:
            word_list = frozenset(word_list)
        except TypeError:
            # Unhashable entries: keep the sequence and its linear scans.
            pass

    end = len(word)
    start = 0
    words = []
    while start < end:
        # Candidates run from longest to shortest.  The lower bound is
        # `start`, so only non-empty slices are tested; an empty string
        # in word_list can therefore never match and stall the scan.
        for stop in range(end, start, -1):
            candidate = word[start:stop]
            if candidate in word_list:
                words.append(candidate)
                start = stop
                break
        else:
            # Nothing matched here: fall back to a one-character token.
            words.append(word[start])
            start += 1
    return words
# 1. Create three dictionaries
nltk_words = nltk.corpus.words.words()  # The NLTK wordlist corpus
unix_words = getData("UsrDictWords.txt")  # UNIX /usr/dict/words
google_words = getData("bw.txt")  # Google's list of most freq. words

# Preprocess all upper-case letters to lowercase.
nltk_words = [w.lower() for w in nltk_words]
unix_words = [w.lower() for w in unix_words]
# Only the first 75k Google entries are kept; slicing before lower-casing
# gives the same list without processing entries that are then dropped.
google_words = [w.lower() for w in google_words[:75000]]

# Single-argument print(...) prints the same text under Python 2 and also
# parses under Python 3.
print("All word lists loaded:")
print("NLTK Wordlist corpus: %1d words." % (len(nltk_words)))
print("UNIX /usr/dict/words: %1d words." % (len(unix_words)))
print("Google's most frequent words: %1d words." % (len(google_words)))
 
def main():
    '''Segment every test hashtag with each word list and print the results.

    Reads the hashtags from "hashtags-test.txt" and the reference
    segmentations from "hashtags-answers.txt".  For each hashtag it runs
    max_match against the module-level nltk_words, unix_words and
    google_words lists and prints the three segmentations followed by a
    separator line.  Returns None.
    '''
    # 2. Read in the test hashtags
    test_tags = getData("hashtags-test.txt")

    # 3. Read in the gold standard segmentation
    with open("hashtags-answers.txt") as answers:
        gold_std = answers.read().split("\n")
    # NOTE(review): gold_std is loaded but not used below.  The read is
    # kept so a missing answers file still raises as before; wire it into
    # the comparison or remove it.

    # 4. process and print results for comparison
    for tag in test_tags:
        print("#%s" % tag)
        r1 = max_match(tag, nltk_words)
        r2 = max_match(tag, unix_words)
        r3 = max_match(tag, google_words)
        print("NLTK:%s\nUNIX:%s\nGoogle:%s" %
              (" ".join(r1), " ".join(r2), " ".join(r3)))
        print("-------------------------------------------------")