Difference between revisions of "CSC111 Counting Unique Words"

From dftwiki3
Jump to: navigation, search
Line 47: Line 47:
 
# of unique words.  The "in" operator is used to check if
 
# of unique words.  The "in" operator is used to check if
 
# a new word is already in the set or not.
 
# a new word is already in the set or not.
def countUniqueWordsWithSet( text ):
+
def countUniqueWordsWithSet1( text ):
 
     wordsSet = set( [] )
 
     wordsSet = set( [] )
 
     for word in text.lower().split():
 
     for word in text.lower().split():
 
         word = word.strip()
 
         word = word.strip()
         if word not in wordsSet:
+
         wordsSet.add( word )
            wordsSet.add( word )
 
 
     print( "found %d unique words" % len( wordsSet ) )
 
     print( "found %d unique words" % len( wordsSet ) )
  
 +
def countUniqueWordsWithSet2( text ):
 +
    wordsSet = set( text.lower().split() )
 +
    print( "found %d unique words" % len( wordsSet ) )
  
  
Line 64: Line 66:
 
     text = getWebPage( url )
 
     text = getWebPage( url )
  
     # count the number of unique words using a set  
+
     # count the number of unique words using a set: Method 1 
 +
    start = clock()
 +
    countUniqueWordsWithSet1( text )
 +
    end  = clock()
 +
    print( "elapsed time using a set (Method 1)= %1.3f seconds" % (end-start ) )
 +
 
 +
    # count the number of unique words using a set: Method 2
 
     start = clock()
 
     start = clock()
     countUniqueWordsWithSet( text )
+
     countUniqueWordsWithSet2( text )
 
     end  = clock()
 
     end  = clock()
     print( "elapsed time using a set = %1.3f seconds" % (end-start ) )
+
     print( "elapsed time using a set (Method 2) = %1.3f seconds" % (end-start ) )
  
 
     # count the number of unique words using a list
 
     # count the number of unique words using a list
Line 85: Line 93:
 
==Output==
 
==Output==
 
<br />
 
<br />
<!--
+
 
  Requesting Web file at http://cs.smith.edu/~thiebaut/111b/4300-8.txt
 
  Requesting Web file at http://cs.smith.edu/~thiebaut/111b/4300-8.txt
 
  Done! Received 1573082 characters.
 
  Done! Received 1573082 characters.
 
  found 45947 unique words
 
  found 45947 unique words
  elapsed time using a set = 0.158 seconds
+
  elapsed time using a set (Method 1)= 0.230 seconds
 +
found 45947 unique words
 +
elapsed time using a set (Method 2) = 0.120 seconds
 
  found 45947 unique words
 
  found 45947 unique words
 
  elapsed time using a set = 65.316 seconds
 
  elapsed time using a set = 65.316 seconds
 
   
 
   
-->
+
 
<br />
 
<br />
 
<br />
 
<br />

Revision as of 14:58, 26 April 2014

--D. Thiebaut (talk) 10:37, 24 April 2014 (EDT)



Counting Unique Words in a Document Using Sets and Lists


# countUniqueWords.py
# D. Thiebaut
# Demonstrates the difference in time complexity between sets and lists.
#
from urllib.request import Request
from urllib.request import urlopen
try:
    from time import clock
except ImportError:
    # time.clock() was removed in Python 3.8; perf_counter() is the
    # replacement for timing short pieces of code.
    from time import perf_counter as clock


# getWebPage(): given a URL will go grab the content of the page
# and return it as a string.  Prints a message before the request
# and the number of characters received after it.
def getWebPage( url ):

    req = Request( url )

    print( "Requesting Web file at", url )

    encoding = 'latin-1'  # can sometimes be 'utf-8'

    # the "with" block closes the connection as soon as the content
    # has been read, even if read() or decode() raises an exception.
    with urlopen( req ) as response:
        text = response.read().decode( encoding )
    
    print( "Done! Received %d characters." % len( text ) )

    return text


# countUniqueWordsWithList(): given a string will print and return
# the number of unique words in the string using a list
# of unique words.  The "in" operator is used to check if
# a new word is already in the list or not.  Words are separated
# by whitespace and compared in lowercase.
def countUniqueWordsWithList( text ):
    words = []
    for word in text.lower().split():
        word = word.strip()
        # "in" on a list scans the list item by item; this search is
        # kept on purpose, as it is what this version demonstrates.
        if word not in words:
            words.append( word )
    print( "found %d unique words" % len( words ) )
    return len( words )


# countUniqueWordsWithSet1(): given a string will print and return
# the number of unique words in the string using a set
# of unique words.  Each word is added to the set one at a time;
# no "in" test is needed because adding a word that is already
# in the set leaves the set unchanged.  Words are separated by
# whitespace and compared in lowercase.
def countUniqueWordsWithSet1( text ):
    wordsSet = set()
    for word in text.lower().split():
        word = word.strip()
        wordsSet.add( word )
    print( "found %d unique words" % len( wordsSet ) )
    return len( wordsSet )

# countUniqueWordsWithSet2(): given a string will print and return
# the number of unique words in the string using a set.  The set
# is built in one step from the list of lowercase words returned
# by split(), without an explicit loop.
def countUniqueWordsWithSet2( text ):
    wordsSet = set( text.lower().split() )
    print( "found %d unique words" % len( wordsSet ) )
    return len( wordsSet )


# main(): gets James Joyce's Ulysses from a Web page, and
# measures the time it takes to count the unique words in
# the book using sets (two methods), and using a list.
def main():
    # time.clock() was removed in Python 3.8; perf_counter() is its
    # replacement for measuring short durations.  It is imported here
    # so that main() does not depend on the removed name.
    from time import perf_counter

    url = "http://cs.smith.edu/~thiebaut/111b/4300-8.txt"
    text = getWebPage( url )

    # count the number of unique words using a set: Method 1
    start = perf_counter()
    countUniqueWordsWithSet1( text )
    end   = perf_counter()
    print( "elapsed time using a set (Method 1)= %1.3f seconds" % (end-start ) )

    # count the number of unique words using a set: Method 2
    start = perf_counter()
    countUniqueWordsWithSet2( text )
    end   = perf_counter()
    print( "elapsed time using a set (Method 2) = %1.3f seconds" % (end-start ) )

    # count the number of unique words using a list
    start = perf_counter()
    countUniqueWordsWithList( text )
    end   = perf_counter()
    print( "elapsed time using a list = %1.3f seconds" % (end-start ))


# run the experiment only when this file is executed as a script, so
# that importing it does not start a download.
if __name__ == "__main__":
    main()



Output


Requesting Web file at http://cs.smith.edu/~thiebaut/111b/4300-8.txt
Done! Received 1573082 characters.
found 45947 unique words
elapsed time using a set (Method 1)= 0.230 seconds
found 45947 unique words
elapsed time using a set (Method 2) = 0.120 seconds
found 45947 unique words
elapsed time using a list = 65.316 seconds