import nltk
def getData(src):
    """Read a list of words from the text file at ``src``.

    The file is expected to hold one word per line, but the contents are
    split with ``str.split()``, so any run of whitespace (spaces, tabs,
    newlines) separates entries and blank lines produce no entries.

    Args:
        src: Path of the file to read.

    Returns:
        A list of the words, in the order they appear in the file.
    """
    # The context manager closes the handle even if read() raises.
    with open(src) as handle:
        return handle.read().split()
def max_match(word, word_list):
    """Segment ``word`` greedily with the longest-match-first (MaxMatch) rule.

    Starting at the left edge, the longest prefix of the remaining text that
    occurs in ``word_list`` is taken as the next segment. If no prefix
    matches, the single character at the current position is emitted as its
    own segment and the scan moves on by one character.

    Args:
        word: The unsegmented string (e.g. a hashtag without the ``#``).
        word_list: Collection of known words. A ``set``/``frozenset`` is used
            as-is; any other iterable is copied into a set once per call so
            each candidate lookup is a hash probe instead of a linear scan.

    Returns:
        A list of segments whose concatenation equals ``word``.
    """
    if isinstance(word_list, (set, frozenset)):
        lexicon = word_list
    else:
        lexicon = set(word_list)

    words = []
    start = 0
    length = len(word)
    while start < length:
        # Stop the scan at start + 1: slices ending at or before `start` are
        # empty, and matching an empty string would never advance `start`.
        for end in range(length, start, -1):
            candidate = word[start:end]
            if candidate in lexicon:
                words.append(candidate)
                start = end
                break
        else:
            # No dictionary word begins here; emit one character and move on.
            words.append(word[start])
            start += 1
    return words
# 1. Create three dictionaries
# Every entry is lower-cased as it is loaded so that lookups against the
# lists are case-insensitive with respect to the source data.
nltk_words = [w.lower() for w in nltk.corpus.words.words()]  # The NLTK wordlist corpus
unix_words = [w.lower() for w in getData("UsrDictWords.txt")]  # UNIX /usr/dict/words
# Google's list of most freq. words: only keep the first 75k words.
# Slicing before lower-casing gives the same list without converting
# entries that are about to be thrown away.
google_words = [w.lower() for w in getData("bw.txt")[:75000]]

# Single-argument print(...) produces identical output on Python 2 and is
# also valid on Python 3.
print("All word lists loaded:")
print("NLTK Wordlist corpus: %1d words." % len(nltk_words))
print("UNIX /usr/dict/words: %1d words." % len(unix_words))
print("Google's most frequent words: %1d words." % len(google_words))
def main():
    """Segment every test hashtag with each word list and print the results.

    Reads the hashtags from ``hashtags-test.txt`` and the gold-standard
    segmentations from ``hashtags-answers.txt``, then, for each hashtag,
    prints the MaxMatch segmentation obtained with the NLTK, UNIX and Google
    word lists, followed by a separator line.
    """
    # 2. Read in the test hashtags
    test_tags = getData("hashtags-test.txt")

    # 3. Read in the gold standard segmentation
    # The context manager makes sure the answers file is closed after reading.
    with open("hashtags-answers.txt") as answers:
        gold_std = answers.read().split("\n")
    # NOTE(review): gold_std is loaded but not consumed by the code below;
    # it is kept so a missing answers file still fails at this point.

    # 4. process and print results for comparison
    for tag in test_tags:
        print("#%s" % tag)
        nltk_segments = max_match(tag, nltk_words)
        unix_segments = max_match(tag, unix_words)
        google_segments = max_match(tag, google_words)
        print("NLTK:%s\nUNIX:%s\nGoogle:%s" % (
            " ".join(nltk_segments),
            " ".join(unix_segments),
            " ".join(google_segments),
        ))
        print("-------------------------------------------------")