# CSC111 Counting Unique Words
# --D. Thiebaut (talk) 10:37, 24 April 2014 (EDT)
# Counting Unique Words in a Document Using Sets and Lists
# countUniqueWords.py
# D. Thiebaut
#
from time import perf_counter
from urllib.request import Request
from urllib.request import urlopen

try:
    from time import clock
except ImportError:
    # time.clock() was removed in Python 3.8; keep the old name usable.
    clock = perf_counter
# getWebPage(): given a URL will go grab the content of the page
# and return it as a string.
def getWebPage( url, encoding='latin-1' ):
    """Fetch the document at `url` and return its content as a string.

    Parameters:
        url: address of the document to download.
        encoding: codec used to decode the received bytes.  Defaults to
            'latin-1'; can sometimes be 'utf-8'.

    Returns:
        The decoded content of the document.

    Raises:
        Any error raised by urlopen() when the request fails, and
        UnicodeDecodeError if the bytes are not valid for `encoding`.
    """
    req = Request( url )
    print( "Requesting Web file at", url )
    # The context manager closes the response even if read() raises,
    # so the underlying connection is never left open.
    with urlopen( req ) as response:
        rawBytes = response.read()
    text = rawBytes.decode( encoding )
    print( "Done! Received %d characters." % len( text ) )
    return text
# countUniqueWordsWithList(): given a string will return
# the number of unique words in the string using a list
# of unique words. The "in" operator is used to check if
# a new word is already in the list or not.
def countUniqueWordsWithList( text ):
    """Count the distinct words of `text`, storing them in a list.

    Words are the whitespace-separated tokens of the lower-cased text,
    so "The" and "the" count as the same word.

    Parameters:
        text: the string whose words are counted.

    Returns:
        The number of unique words, which is also printed.
    """
    words = []
    # split() with no argument already discards all surrounding
    # whitespace, so the tokens need no further stripping.
    for word in text.lower().split():
        # A list is used deliberately: main() times this membership
        # test against the set-based version.
        if word not in words:
            words.append( word )
    print( "found %d unique words" % len( words ) )
    return len( words )
# countUniqueWordsWithSet(): given a string will return
# the number of unique words in the string using a set
# of unique words. The "in" operator is used to check if
# a new word is already in the set or not.
def countUniqueWordsWithSet( text ):
    """Count the distinct words of `text`, storing them in a set.

    Words are the whitespace-separated tokens of the lower-cased text,
    so "The" and "the" count as the same word.

    Parameters:
        text: the string whose words are counted.

    Returns:
        The number of unique words, which is also printed.
    """
    wordsSet = set()
    # split() with no argument already discards all surrounding
    # whitespace, so the tokens need no further stripping.
    for word in text.lower().split():
        # The explicit "in" test mirrors countUniqueWordsWithList() so
        # that the two functions differ only in the container used.
        if word not in wordsSet:
            wordsSet.add( word )
    print( "found %d unique words" % len( wordsSet ) )
    return len( wordsSet )
# main(): gets James Joyce's Ulysses from a Web page, and
# measures the time it takes to count the unique words in
# the book using lists, and using sets.
def main():
    """Download the book and time both unique-word counters on it.

    Fetches the text with getWebPage(), then measures how long
    countUniqueWordsWithSet() and countUniqueWordsWithList() each take
    to process it, printing the elapsed time of each.
    """
    url = "http://cs.smith.edu/~thiebaut/111b/4300-8.txt"
    text = getWebPage( url )

    # time.clock() was removed in Python 3.8; perf_counter() is used
    # instead to measure the elapsed time of each call.

    # count the number of unique words using a set
    start = perf_counter()
    countUniqueWordsWithSet( text )
    end = perf_counter()
    print( "elapsed time using a set = %1.3f seconds" % (end-start ) )

    # count the number of unique words using a list
    start = perf_counter()
    countUniqueWordsWithList( text )
    end = perf_counter()
    print( "elapsed time using a list = %1.3f seconds" % (end-start ))
# Run the benchmark only when this file is executed as a script, so
# that importing it does not trigger the download.
if __name__ == "__main__":
    main()