Difference between revisions of "CSC111 Counting Unique Words"
(→Counting Unique Words in a Document Using Sets and Lists) |
(→Finding the most frequent words using a dictionary) |
||
(11 intermediate revisions by the same user not shown) | |||
Line 7: | Line 7: | ||
# countUniqueWords.py | # countUniqueWords.py | ||
# D. Thiebaut | # D. Thiebaut | ||
+ | # Demonstrates the difference in time complexity between sets and lists. | ||
# | # | ||
from urllib.request import Request | from urllib.request import Request | ||
Line 46: | Line 47: | ||
# of unique words. The "in" operator is used to check if | # of unique words. The "in" operator is used to check if | ||
# a new word is already in the set or not. | # a new word is already in the set or not. | ||
− | def | + | def countUniqueWordsWithSet1( text ): |
wordsSet = set( [] ) | wordsSet = set( [] ) | ||
for word in text.lower().split(): | for word in text.lower().split(): | ||
word = word.strip() | word = word.strip() | ||
− | + | wordsSet.add( word ) | |
− | |||
print( "found %d unique words" % len( wordsSet ) ) | print( "found %d unique words" % len( wordsSet ) ) | ||
+ | def countUniqueWordsWithSet2( text ): | ||
+ | wordsSet = set( text.lower().split() ) | ||
+ | print( "found %d unique words" % len( wordsSet ) ) | ||
Line 62: | Line 65: | ||
url = "http://cs.smith.edu/~thiebaut/111b/4300-8.txt" | url = "http://cs.smith.edu/~thiebaut/111b/4300-8.txt" | ||
text = getWebPage( url ) | text = getWebPage( url ) | ||
+ | print( "Number of words in document = ", len( text.split() ) ) | ||
+ | |||
+ | # count the number of unique words using a set: Method 1 | ||
+ | start = clock() | ||
+ | countUniqueWordsWithSet1( text ) | ||
+ | end = clock() | ||
+ | print( "elapsed time using a set (Method 1)= %1.3f seconds" % (end-start ) ) | ||
− | # count the number of unique words using a set | + | # count the number of unique words using a set: Method 2 |
start = clock() | start = clock() | ||
− | + | countUniqueWordsWithSet2( text ) | |
end = clock() | end = clock() | ||
− | print( "elapsed time using a set = %1.3f seconds" % (end-start ) ) | + | print( "elapsed time using a set (Method 2) = %1.3f seconds" % (end-start ) ) |
# count the number of unique words using a list | # count the number of unique words using a list | ||
Line 82: | Line 92: | ||
<br /> | <br /> | ||
<br /> | <br /> | ||
+ | ==Output== | ||
<br /> | <br /> | ||
+ | Running the code above on a MacBook Pro circa 2011 yields the following execution time: | ||
+ | |||
+ | Requesting Web file at http://cs.smith.edu/~thiebaut/111b/4300-8.txt | ||
+ | Done! Received 1573082 characters. | ||
+ | Number of words in document = 267980 | ||
+ | |||
+ | found 45947 unique words | ||
+ | elapsed time using a set (Method 1)= 0.230 seconds | ||
+ | found 45947 unique words | ||
+ | elapsed time using a set (Method 2) = 0.120 seconds | ||
+ | found 45947 unique words | ||
+ | elapsed time using a list = 110.810 seconds | ||
+ | |||
+ | We find the 45,957 unique words in James Joyce's Ulysses about a thousand times faster using a set than using a list. | ||
+ | |||
+ | |||
<br /> | <br /> | ||
<br /> | <br /> | ||
+ | =Finding the most frequent words using a dictionary= | ||
<br /> | <br /> | ||
+ | <source lang="python"> | ||
+ | # countUniqueAndFrequencyofWords.py | ||
+ | # D. Thiebaut | ||
+ | # Demonstrates the difference in time complexity between sets, dictionary, and lists. | ||
+ | # | ||
+ | from urllib.request import Request | ||
+ | from urllib.request import urlopen | ||
+ | from time import clock | ||
+ | |||
+ | |||
+ | # getWebPage(): given a URL will go grab the content of the page | ||
+ | # and return it as a string. | ||
+ | def getWebPage( url ): | ||
+ | |||
+ | req = Request( url ) | ||
+ | |||
+ | print( "Requesting Web file at", url ) | ||
+ | |||
+ | encoding = 'latin-1' # can sometimes be 'utf-8' | ||
+ | text = urlopen( req ).read().decode( encoding ) | ||
+ | |||
+ | print( "Done! Received %d characters." % len( text ) ) | ||
+ | |||
+ | return text | ||
+ | |||
+ | |||
+ | # countUniqueWordsWithList(): given a string will return | ||
+ | # the number of unique words in the string using a list | ||
+ | # of unique words. The "in" operator is used to check if | ||
+ | # a new word is already in the list or not. | ||
+ | def countUniqueWordsWithList( text ): | ||
+ | words = [] | ||
+ | for word in text.lower().split(): | ||
+ | word = word.strip() | ||
+ | if word not in words: | ||
+ | words.append( word ) | ||
+ | print( "found %d unique words" % len( words ) ) | ||
+ | |||
+ | |||
+ | # countUniqueWordsWithSet(): given a string will return | ||
+ | # the number of unique words in the string using a set | ||
+ | # of unique words. The "in" operator is used to check if | ||
+ | # a new word is already in the set or not. | ||
+ | def countUniqueWordsWithSet1( text ): | ||
+ | wordsSet = set( [] ) | ||
+ | for word in text.lower().split(): | ||
+ | word = word.strip() | ||
+ | wordsSet.add( word ) | ||
+ | print( "found %d unique words" % len( wordsSet ) ) | ||
+ | |||
+ | def countUniqueWordsWithSet2( text ): | ||
+ | wordsSet = set( text.lower().split() ) | ||
+ | print( "found %d unique words" % len( wordsSet ) ) | ||
+ | |||
+ | # counts the frequency of occurrence of every word in the | ||
+ | # text | ||
+ | def countFrequencies( text ): | ||
+ | # create dictionary: key = word, value = count | ||
+ | freq = {} | ||
+ | |||
+ | # count each word in the text | ||
+ | for word in text.split(): | ||
+ | try: | ||
+ | freq[word] += 1 | ||
+ | except KeyError: | ||
+ | freq[word] = 1 | ||
+ | |||
+ | tuples = [ (count,word) for word,count in freq.items() ] | ||
+ | tuples.sort() | ||
+ | return tuples[-10:] | ||
+ | |||
+ | |||
+ | # main(): gets James Joyce's Ulysses from a Web page, and | ||
+ | # measures the time it takes to count the unique words in | ||
+ | # the book using lists, and using sets. | ||
+ | def main(): | ||
+ | url = "http://cs.smith.edu/~thiebaut/111b/4300-8.txt" | ||
+ | text = getWebPage( url ) | ||
+ | print( "Number of words in document = %d\n\n" % len( text.split() ) ) | ||
+ | |||
+ | # find the most frequent word in the text | ||
+ | start = clock() | ||
+ | tuples = countFrequencies( text ) | ||
+ | end = clock() | ||
+ | for count, word in tuples: | ||
+ | print( "most frequent top-10 word \"%5s\" appeared %d times" % (word, count) ) | ||
+ | |||
+ | print( "\nelapsed time using a dictionary = %1.3f seconds\n" % (end-start ) ) | ||
+ | |||
+ | |||
+ | # count the number of unique words using a set: Method 1 | ||
+ | start = clock() | ||
+ | countUniqueWordsWithSet1( text ) | ||
+ | end = clock() | ||
+ | print( "elapsed time using a set (Method 1)= %1.3f seconds\n" % (end-start ) ) | ||
+ | |||
+ | # count the number of unique words using a set: Method 2 | ||
+ | start = clock() | ||
+ | countUniqueWordsWithSet2( text ) | ||
+ | end = clock() | ||
+ | print( "elapsed time using a set (Method 2) = %1.3f seconds\n" % (end-start ) ) | ||
+ | |||
+ | # count the number of unique words using a list | ||
+ | start = clock() | ||
+ | countUniqueWordsWithList( text ) | ||
+ | end = clock() | ||
+ | print( "elapsed time using a list = %1.3f seconds\n" % (end-start )) | ||
+ | |||
+ | |||
+ | main() | ||
+ | |||
+ | </source> | ||
<br /> | <br /> | ||
+ | |||
+ | ==Output== | ||
<br /> | <br /> | ||
+ | <source lang="text"> | ||
+ | |||
+ | Requesting Web file at http://cs.smith.edu/~thiebaut/111b/4300-8.txt | ||
+ | Done! Received 1573082 characters. | ||
+ | Number of words in document = 267980 | ||
+ | |||
+ | |||
+ | most frequent top-10 word " with" appeared 2391 times | ||
+ | most frequent top-10 word " I" appeared 2432 times | ||
+ | most frequent top-10 word " he" appeared 2712 times | ||
+ | most frequent top-10 word " his" appeared 3035 times | ||
+ | most frequent top-10 word " in" appeared 4606 times | ||
+ | most frequent top-10 word " to" appeared 4787 times | ||
+ | most frequent top-10 word " a" appeared 5842 times | ||
+ | most frequent top-10 word " and" appeared 6542 times | ||
+ | most frequent top-10 word " of" appeared 8127 times | ||
+ | most frequent top-10 word " the" appeared 13600 times | ||
+ | |||
+ | elapsed time using a dictionary = 0.262 seconds | ||
+ | |||
+ | found 45947 unique words | ||
+ | elapsed time using a set (Method 1)= 0.170 seconds | ||
+ | |||
+ | found 45947 unique words | ||
+ | elapsed time using a set (Method 2) = 0.111 seconds | ||
+ | |||
+ | found 45947 unique words | ||
+ | elapsed time using a list = 70.772 seconds | ||
+ | |||
+ | </source> | ||
<br /> | <br /> | ||
+ | |||
<br /> | <br /> | ||
<br /> | <br /> |
Latest revision as of 14:54, 28 April 2014
--D. Thiebaut (talk) 10:37, 24 April 2014 (EDT)
Contents
Counting Unique Words in a Document Using Sets and Lists
# countUniqueWords.py
# D. Thiebaut
# Demonstrates the difference in time complexity between sets and lists.
#
from urllib.request import Request
from urllib.request import urlopen
from time import clock
# getWebPage(): given a URL will go grab the content of the page
# and return it as a string.
def getWebPage( url ):
req = Request( url )
print( "Requesting Web file at", url )
encoding = 'latin-1' # can sometimes be 'utf-8'
text = urlopen( req ).read().decode( encoding )
print( "Done! Received %d characters." % len( text ) )
return text
# countUniqueWordsWithList(): given a string will return
# the number of unique words in the string using a list
# of unique words. The "in" operator is used to check if
# a new word is already in the list or not.
def countUniqueWordsWithList( text ):
words = []
for word in text.lower().split():
word = word.strip()
if word not in words:
words.append( word )
print( "found %d unique words" % len( words ) )
# countUniqueWordsWithSet(): given a string will return
# the number of unique words in the string using a set
# of unique words. The "in" operator is used to check if
# a new word is already in the set or not.
def countUniqueWordsWithSet1( text ):
wordsSet = set( [] )
for word in text.lower().split():
word = word.strip()
wordsSet.add( word )
print( "found %d unique words" % len( wordsSet ) )
def countUniqueWordsWithSet2( text ):
wordsSet = set( text.lower().split() )
print( "found %d unique words" % len( wordsSet ) )
# main(): gets James Joyce's Ulysses from a Web page, and
# measures the time it takes to count the unique words in
# the book using lists, and using sets.
def main():
url = "http://cs.smith.edu/~thiebaut/111b/4300-8.txt"
text = getWebPage( url )
print( "Number of words in document = ", len( text.split() ) )
# count the number of unique words using a set: Method 1
start = clock()
countUniqueWordsWithSet1( text )
end = clock()
print( "elapsed time using a set (Method 1)= %1.3f seconds" % (end-start ) )
# count the number of unique words using a set: Method 2
start = clock()
countUniqueWordsWithSet2( text )
end = clock()
print( "elapsed time using a set (Method 2) = %1.3f seconds" % (end-start ) )
# count the number of unique words using a list
start = clock()
countUniqueWordsWithList( text )
end = clock()
print( "elapsed time using a list = %1.3f seconds" % (end-start ))
main()
Output
Running the code above on a MacBook Pro circa 2011 yields the following execution time:
Requesting Web file at http://cs.smith.edu/~thiebaut/111b/4300-8.txt Done! Received 1573082 characters. Number of words in document = 267980 found 45947 unique words elapsed time using a set (Method 1)= 0.230 seconds found 45947 unique words elapsed time using a set (Method 2) = 0.120 seconds found 45947 unique words elapsed time using a list = 110.810 seconds
We find the 45,957 unique words in James Joyce's Ulysses about a thousand times faster using a set than using a list.
Finding the most frequent words using a dictionary
# countUniqueAndFrequencyofWords.py
# D. Thiebaut
# Demonstrates the difference in time complexity between sets, dictionary, and lists.
#
from urllib.request import Request
from urllib.request import urlopen
from time import clock
# getWebPage(): given a URL will go grab the content of the page
# and return it as a string.
def getWebPage( url ):
req = Request( url )
print( "Requesting Web file at", url )
encoding = 'latin-1' # can sometimes be 'utf-8'
text = urlopen( req ).read().decode( encoding )
print( "Done! Received %d characters." % len( text ) )
return text
# countUniqueWordsWithList(): given a string will return
# the number of unique words in the string using a list
# of unique words. The "in" operator is used to check if
# a new word is already in the list or not.
def countUniqueWordsWithList( text ):
words = []
for word in text.lower().split():
word = word.strip()
if word not in words:
words.append( word )
print( "found %d unique words" % len( words ) )
# countUniqueWordsWithSet(): given a string will return
# the number of unique words in the string using a set
# of unique words. The "in" operator is used to check if
# a new word is already in the set or not.
def countUniqueWordsWithSet1( text ):
wordsSet = set( [] )
for word in text.lower().split():
word = word.strip()
wordsSet.add( word )
print( "found %d unique words" % len( wordsSet ) )
def countUniqueWordsWithSet2( text ):
wordsSet = set( text.lower().split() )
print( "found %d unique words" % len( wordsSet ) )
# counts the frequency of occurrence of every word in the
# text
def countFrequencies( text ):
# create dictionary: key = word, value = count
freq = {}
# count each word in the text
for word in text.split():
try:
freq[word] += 1
except KeyError:
freq[word] = 1
tuples = [ (count,word) for word,count in freq.items() ]
tuples.sort()
return tuples[-10:]
# main(): gets James Joyce's Ulysses from a Web page, and
# measures the time it takes to count the unique words in
# the book using lists, and using sets.
def main():
url = "http://cs.smith.edu/~thiebaut/111b/4300-8.txt"
text = getWebPage( url )
print( "Number of words in document = %d\n\n" % len( text.split() ) )
# find the most frequent word in the text
start = clock()
tuples = countFrequencies( text )
end = clock()
for count, word in tuples:
print( "most frequent top-10 word \"%5s\" appeared %d times" % (word, count) )
print( "\nelapsed time using a dictionary = %1.3f seconds\n" % (end-start ) )
# count the number of unique words using a set: Method 1
start = clock()
countUniqueWordsWithSet1( text )
end = clock()
print( "elapsed time using a set (Method 1)= %1.3f seconds\n" % (end-start ) )
# count the number of unique words using a set: Method 2
start = clock()
countUniqueWordsWithSet2( text )
end = clock()
print( "elapsed time using a set (Method 2) = %1.3f seconds\n" % (end-start ) )
# count the number of unique words using a list
start = clock()
countUniqueWordsWithList( text )
end = clock()
print( "elapsed time using a list = %1.3f seconds\n" % (end-start ))
main()
Output
Requesting Web file at http://cs.smith.edu/~thiebaut/111b/4300-8.txt
Done! Received 1573082 characters.
Number of words in document = 267980
most frequent top-10 word " with" appeared 2391 times
most frequent top-10 word " I" appeared 2432 times
most frequent top-10 word " he" appeared 2712 times
most frequent top-10 word " his" appeared 3035 times
most frequent top-10 word " in" appeared 4606 times
most frequent top-10 word " to" appeared 4787 times
most frequent top-10 word " a" appeared 5842 times
most frequent top-10 word " and" appeared 6542 times
most frequent top-10 word " of" appeared 8127 times
most frequent top-10 word " the" appeared 13600 times
elapsed time using a dictionary = 0.262 seconds
found 45947 unique words
elapsed time using a set (Method 1)= 0.170 seconds
found 45947 unique words
elapsed time using a set (Method 2) = 0.111 seconds
found 45947 unique words
elapsed time using a list = 70.772 seconds