Difference between revisions of "CSC111 Counting Unique Words"

From dftwiki3
Jump to: navigation, search
(Finding the most frequent words using a dictionary)
 
(6 intermediate revisions by the same user not shown)
Line 105: Line 105:
 
  elapsed time using a set (Method 2) = 0.120 seconds
 
  elapsed time using a set (Method 2) = 0.120 seconds
 
  found 45947 unique words
 
  found 45947 unique words
  elapsed time using a set = 110.810 seconds
+
  elapsed time using a list = 110.810 seconds
  
 
We find the 45,957 unique words in James Joyce's Ulysses about a thousand times faster using a set than using a list.
 
We find the 45,957 unique words in James Joyce's Ulysses about a thousand times faster using a set than using a list.
Line 112: Line 112:
 
<br />
 
<br />
 
<br />
 
<br />
 +
=Finding the most frequent words using a dictionary=
 
<br />
 
<br />
 +
<source lang="python">
 +
# countUniqueAndFrequencyofWords.py
 +
# D. Thiebaut
 +
# Demonstrates the difference in time complexity between sets, dictionary, and lists.
 +
#
 +
from urllib.request import Request
 +
from urllib.request import urlopen
 +
from time import clock
 +
 +
 +
# getWebPage(): given a URL will go grab the content of the page
 +
# and return it as a string.
 +
def getWebPage( url ):
 +
 +
    req = Request( url )
 +
 +
    print( "Requesting Web file at", url )
 +
 +
    encoding = 'latin-1'  # can sometimes be 'utf-8'
 +
    text = urlopen( req ).read().decode( encoding )
 +
   
 +
    print( "Done! Received %d characters." % len( text ) )
 +
 +
    return text
 +
 +
 +
# countUniqueWordsWithList(): given a string will return
 +
# the number of unique words in the string using a list
 +
# of unique words.  The "in" operator is used to check if
 +
# a new word is already in the list or not.
 +
def countUniqueWordsWithList( text ):
 +
    words = []
 +
    for word in text.lower().split():
 +
        word = word.strip()
 +
        if word not in words:
 +
            words.append( word )
 +
    print( "found %d unique words" % len( words ) )
 +
 +
 +
# countUniqueWordsWithSet(): given a string will return
 +
# the number of unique words in the string using a set
 +
# of unique words.  The "in" operator is used to check if
 +
# a new word is already in the set or not.
 +
def countUniqueWordsWithSet1( text ):
 +
    wordsSet = set( [] )
 +
    for word in text.lower().split():
 +
        word = word.strip()
 +
        wordsSet.add( word )
 +
    print( "found %d unique words" % len( wordsSet ) )
 +
 +
def countUniqueWordsWithSet2( text ):
 +
    wordsSet = set( text.lower().split() )
 +
    print( "found %d unique words" % len( wordsSet ) )
 +
 +
# counts the frequency of occurrence of every word in the
 +
# text
 +
def countFrequencies( text ):
 +
    # create dictionary: key = word, value = count
 +
    freq = {} 
 +
 +
    # count each word in the text
 +
    for word in text.split():
 +
        try:
 +
            freq[word] += 1
 +
        except KeyError:
 +
            freq[word] = 1
 +
 +
    tuples = [ (count,word) for word,count in freq.items() ]
 +
    tuples.sort()
 +
    return tuples[-10:]
 +
       
 +
           
 +
# main(): gets James Joyce's Ulysses from a Web page, and
 +
# measures the time it takes to count the unique words in
 +
# the book using lists, and using sets.
 +
def main():
 +
    url = "http://cs.smith.edu/~thiebaut/111b/4300-8.txt"
 +
    text = getWebPage( url )
 +
    print( "Number of words in document = %d\n\n" % len( text.split() ) )
 +
 +
    # find the most frequent word in the text
 +
    start = clock()
 +
    tuples = countFrequencies( text )
 +
    end  = clock()
 +
    for count, word in tuples:
 +
        print( "most frequent top-10 word \"%5s\" appeared %d times" % (word, count) )
 +
 +
    print( "\nelapsed time using a dictionary = %1.3f seconds\n" % (end-start ) )
 +
 +
   
 +
    # count the number of unique words using a set: Method 1 
 +
    start = clock()
 +
    countUniqueWordsWithSet1( text )
 +
    end  = clock()
 +
    print( "elapsed time using a set (Method 1)= %1.3f seconds\n" % (end-start ) )
 +
 +
    # count the number of unique words using a set: Method 2
 +
    start = clock()
 +
    countUniqueWordsWithSet2( text )
 +
    end  = clock()
 +
    print( "elapsed time using a set (Method 2) = %1.3f seconds\n" % (end-start ) )
 +
 +
    # count the number of unique words using a list
 +
    start = clock()
 +
    countUniqueWordsWithList( text )
 +
    end  = clock()
 +
    print( "elapsed time using a list = %1.3f seconds\n" % (end-start ))
 +
 +
 +
main()
 +
 +
</source>
 
<br />
 
<br />
 +
 +
==Output==
 
<br />
 
<br />
 +
<source lang="text">
 +
 +
Requesting Web file at http://cs.smith.edu/~thiebaut/111b/4300-8.txt
 +
Done! Received 1573082 characters.
 +
Number of words in document = 267980
 +
 +
 +
most frequent top-10 word " with" appeared 2391 times
 +
most frequent top-10 word "    I" appeared 2432 times
 +
most frequent top-10 word "  he" appeared 2712 times
 +
most frequent top-10 word "  his" appeared 3035 times
 +
most frequent top-10 word "  in" appeared 4606 times
 +
most frequent top-10 word "  to" appeared 4787 times
 +
most frequent top-10 word "    a" appeared 5842 times
 +
most frequent top-10 word "  and" appeared 6542 times
 +
most frequent top-10 word "  of" appeared 8127 times
 +
most frequent top-10 word "  the" appeared 13600 times
 +
 +
elapsed time using a dictionary = 0.262 seconds
 +
 +
found 45947 unique words
 +
elapsed time using a set (Method 1)= 0.170 seconds
 +
 +
found 45947 unique words
 +
elapsed time using a set (Method 2) = 0.111 seconds
 +
 +
found 45947 unique words
 +
elapsed time using a list = 70.772 seconds
 +
 +
</source>
 
<br />
 
<br />
 +
 
<br />
 
<br />
 
<br />
 
<br />

Latest revision as of 14:54, 28 April 2014

--D. Thiebaut (talk) 10:37, 24 April 2014 (EDT)



Counting Unique Words in a Document Using Sets and Lists


# countUniqueWords.py
# D. Thiebaut
# Demonstrates the difference in time complexity between sets and lists.
#
from urllib.request import Request
from urllib.request import urlopen
from time import clock


# getWebPage(): given a URL will go grab the content of the page
# and return it as a string.
def getWebPage( url ):

    req = Request( url )

    print( "Requesting Web file at", url )

    encoding = 'latin-1'  # can sometimes be 'utf-8'
    text = urlopen( req ).read().decode( encoding )
    
    print( "Done! Received %d characters." % len( text ) )

    return text


# countUniqueWordsWithList(): given a string will return
# the number of unique words in the string using a list
# of unique words.  The "in" operator is used to check if
# a new word is already in the list or not.
def countUniqueWordsWithList( text ):
    words = []
    for word in text.lower().split():
        word = word.strip()
        if word not in words:
            words.append( word )
    print( "found %d unique words" % len( words ) )


# countUniqueWordsWithSet(): given a string will return
# the number of unique words in the string using a set
# of unique words.  The "in" operator is used to check if
# a new word is already in the set or not.
def countUniqueWordsWithSet1( text ):
    wordsSet = set( [] )
    for word in text.lower().split():
        word = word.strip()
        wordsSet.add( word )
    print( "found %d unique words" % len( wordsSet ) )

def countUniqueWordsWithSet2( text ):
    wordsSet = set( text.lower().split() )
    print( "found %d unique words" % len( wordsSet ) )


# main(): gets James Joyce's Ulysses from a Web page, and
# measures the time it takes to count the unique words in
# the book using lists, and using sets.
def main():
    url = "http://cs.smith.edu/~thiebaut/111b/4300-8.txt"
    text = getWebPage( url )
    print( "Number of words in document = ", len( text.split() ) )

    # count the number of unique words using a set: Method 1  
    start = clock()
    countUniqueWordsWithSet1( text )
    end   = clock()
    print( "elapsed time using a set (Method 1)= %1.3f seconds" % (end-start ) )

    # count the number of unique words using a set: Method 2
    start = clock()
    countUniqueWordsWithSet2( text )
    end   = clock()
    print( "elapsed time using a set (Method 2) = %1.3f seconds" % (end-start ) )

    # count the number of unique words using a list
    start = clock()
    countUniqueWordsWithList( text )
    end   = clock()
    print( "elapsed time using a list = %1.3f seconds" % (end-start ))


main()



Output


Running the code above on a MacBook Pro circa 2011 yields the following execution time:

Requesting Web file at http://cs.smith.edu/~thiebaut/111b/4300-8.txt
Done! Received 1573082 characters.
Number of words in document =  267980

found 45947 unique words
elapsed time using a set (Method 1)= 0.230 seconds
found 45947 unique words
elapsed time using a set (Method 2) = 0.120 seconds
found 45947 unique words
elapsed time using a list = 110.810 seconds

We find the 45,957 unique words in James Joyce's Ulysses about a thousand times faster using a set than using a list.




Finding the most frequent words using a dictionary


# countUniqueAndFrequencyofWords.py
# D. Thiebaut
# Demonstrates the difference in time complexity between sets, dictionary, and lists.
#
from urllib.request import Request
from urllib.request import urlopen
from time import clock


# getWebPage(): given a URL will go grab the content of the page
# and return it as a string.
def getWebPage( url ):

    req = Request( url )

    print( "Requesting Web file at", url )

    encoding = 'latin-1'  # can sometimes be 'utf-8'
    text = urlopen( req ).read().decode( encoding )
    
    print( "Done! Received %d characters." % len( text ) )

    return text


# countUniqueWordsWithList(): given a string will return
# the number of unique words in the string using a list
# of unique words.  The "in" operator is used to check if
# a new word is already in the list or not.
def countUniqueWordsWithList( text ):
    words = []
    for word in text.lower().split():
        word = word.strip()
        if word not in words:
            words.append( word )
    print( "found %d unique words" % len( words ) )


# countUniqueWordsWithSet(): given a string will return
# the number of unique words in the string using a set
# of unique words.  The "in" operator is used to check if
# a new word is already in the set or not.
def countUniqueWordsWithSet1( text ):
    wordsSet = set( [] )
    for word in text.lower().split():
        word = word.strip()
        wordsSet.add( word )
    print( "found %d unique words" % len( wordsSet ) )

def countUniqueWordsWithSet2( text ):
    wordsSet = set( text.lower().split() )
    print( "found %d unique words" % len( wordsSet ) )

# counts the frequency of occurrence of every word in the 
# text
def countFrequencies( text ):
    # create dictionary: key = word, value = count 
    freq = {}  

    # count each word in the text
    for word in text.split():
        try:
            freq[word] += 1
        except KeyError:
            freq[word] = 1

    tuples = [ (count,word) for word,count in freq.items() ]
    tuples.sort()
    return tuples[-10:]
        
            
# main(): gets James Joyce's Ulysses from a Web page, and
# measures the time it takes to count the unique words in
# the book using lists, and using sets.
def main():
    url = "http://cs.smith.edu/~thiebaut/111b/4300-8.txt"
    text = getWebPage( url )
    print( "Number of words in document = %d\n\n" % len( text.split() ) )

    # find the most frequent word in the text
    start = clock()
    tuples = countFrequencies( text )
    end   = clock()
    for count, word in tuples:
        print( "most frequent top-10 word \"%5s\" appeared %d times" % (word, count) )

    print( "\nelapsed time using a dictionary = %1.3f seconds\n" % (end-start ) )

    
    # count the number of unique words using a set: Method 1  
    start = clock()
    countUniqueWordsWithSet1( text )
    end   = clock()
    print( "elapsed time using a set (Method 1)= %1.3f seconds\n" % (end-start ) )

    # count the number of unique words using a set: Method 2
    start = clock()
    countUniqueWordsWithSet2( text )
    end   = clock()
    print( "elapsed time using a set (Method 2) = %1.3f seconds\n" % (end-start ) )

    # count the number of unique words using a list
    start = clock()
    countUniqueWordsWithList( text )
    end   = clock()
    print( "elapsed time using a list = %1.3f seconds\n" % (end-start ))


main()


Output


Requesting Web file at http://cs.smith.edu/~thiebaut/111b/4300-8.txt
Done! Received 1573082 characters.
Number of words in document = 267980


most frequent top-10 word " with" appeared 2391 times
most frequent top-10 word "    I" appeared 2432 times
most frequent top-10 word "   he" appeared 2712 times
most frequent top-10 word "  his" appeared 3035 times
most frequent top-10 word "   in" appeared 4606 times
most frequent top-10 word "   to" appeared 4787 times
most frequent top-10 word "    a" appeared 5842 times
most frequent top-10 word "  and" appeared 6542 times
most frequent top-10 word "   of" appeared 8127 times
most frequent top-10 word "  the" appeared 13600 times

elapsed time using a dictionary = 0.262 seconds

found 45947 unique words
elapsed time using a set (Method 1)= 0.170 seconds

found 45947 unique words
elapsed time using a set (Method 2) = 0.111 seconds

found 45947 unique words
elapsed time using a list = 70.772 seconds