CSC352 Homework 2 Solution 1

--D. Thiebaut 22:38, 22 February 2010 (UTC)
----

<source lang="python">
#! /usr/bin/python
# D. Thiebaut
# threadedRetrieveEtext.py
#
# This program gets a keyword from the command line, and 
# launches a thread that accesses a server and prompts it for
# a list of 20 Urls of files containing that keyword.
# The program then launches 20 threads on the 20 different URLs,
# and waits for all 20 of them in parallel.
# When the 20 threads are done, the main program gets the 
# excerpts from each file with a few words before and after the 
# keyword, and prints the results on the screen.
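#
# Usage (the keyword is the only command-line argument):
#    python threadedRetrieveEtext.py  keyword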

import sys
import urllib, urllib2
import textwrap
import threading
import Queue        # use "queue" lowercase in Python 3
import time
import random

MAINURL = "http://xgridmac.dyndns.org/~thiebaut/swish-e/swishe.php"
VERSION = """Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11)
 Gecko/20071127 Firefox/2.0.0.11"""
HEADERS       = {"User-Agent"  : VERSION }
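# (the User-Agent string above makes the requests look like they come
# from a regular Firefox browser rather than from a Python script)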



class urlRetrieverThread( threading.Thread ):
    
    def __init__( self, url, keyword, queue=None, Id=None ):
        threading.Thread.__init__( self )
        self.Id        = Id
        self.queue     = queue
        self.url       = url      # the url of the text/page to be retrieved
        self.text      = None     # the page retrieved
        self.keyword   = keyword  # the keyword to search for, if any
        self.args      = {}
        if keyword is not None:
            self.args = { "search" : keyword }
            
    def getPage( self ):
        """returns the page retrieved by this thread.  Make sure
        isAlive() returns false first, to make sure the thread has
        finished before calling this."""
        return self.text

    def getExerpt( self ):
        """returns the 1st portion of the page retrieved by this thread
        surrounding the current keyword.  Make sure
        isAlive() returns false first, to make sure the thread has
        finished before calling this."""
        if self.keyword is None or self.text is None:
            return ""
        
        offset = self.text.find( self.keyword )
        if offset == -1:
            return ""
        exerpt = self.text[ max(offset-60, 0)
                          : min( offset+60, len(self.text) )]

        #--- drop the partial words at both ends of the window, collapse
        #--- duplicate white space, and wrap at 40 chars per line ---
        exerpt = ' '.join( exerpt.split()[1:-1] )
        exerpt = '\n'.join( textwrap.wrap( exerpt, 40 ) )
        return exerpt

    def run( self ):        
        req = urllib2.Request( self.url,
                               urllib.urlencode( self.args ),
                               HEADERS )
        f = urllib2.urlopen( req )
        self.text = f.read()
        
        #--- Just to see what is happening, this section adds a random
        #--- delay so that the results from the different threads can
        #--- be seen arriving over time.
        #--- Remove the sleep below for normal operation!
        time.sleep( random.randint( 1, 50 ) / 10.0 )

        #--- tell main program we're done ---
        if self.queue is not None:
            self.queue.put( self.Id )
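
# A minimal usage sketch for the class above (it mirrors what main()
# does for the initial keyword query; "whale" is just a placeholder
# keyword):
#
#     t = urlRetrieverThread( MAINURL, "whale" )
#     t.start()            # run() downloads the page in the background
#     t.join()             # block until the thread is done
#     print t.getPage()    # the raw server reply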
            
def parse( text, debug=False ):
    """parses the server reply and returns a list of dictionaries,
    one per document, with keys "url", "score", "file", and "offset"."""
    lines = text.split( "\n" )
    docs = []
    doc  = {}
    for line in lines:
        if line.find( "<br>url:" ) != -1:
            doc["url"] = line.split( "url:" )[1].strip()
        if line.find( "<br>score:" ) != -1:
            doc["score"] = int( line.split( "score:" )[1].strip() )
        if line.find( "<br>file:" ) != -1:
            doc["file"] = line.split( "file:" )[1].strip()
        if line.find( "<br>offset:" ) != -1:
            #--- "offset" is the last field of a record: document done ---
            doc["offset"] = int( line.split( "offset:" )[1].strip() )
            docs.append( doc )
            doc = {}

    return docs
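
# Example (the exact server reply format is inferred from the parsing
# above, so treat it as an assumption): a reply containing, for each
# document, the four lines
#
#     <br>url: ...
#     <br>score: ...
#     <br>file: ...
#     <br>offset: ...
#
# yields one {"url":..., "score":..., "file":..., "offset":...}
# dictionary per document in the returned list.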


def getAllDocs( listOfDocs, keywords, debug=False ):
    """serial version: fetches the documents one after the other.
    Left over from the non-threaded solution; it is never called by
    this program, and getOneDoc() is not defined in this file."""
    for doc in listOfDocs:
        url = doc[ "url" ]
        fileName = doc[ "file" ]
        if debug:
            print "--------------------------------------------"
            print "fetching url:", url
        text = getOneDoc( url, keywords, debug )
        if debug: print text


def goHome():
    """brings the cursor to the top-left corner of the screen and
    clears it (ANSI escapes: ESC[0;0H homes the cursor, ESC[2J
    clears the screen)"""
    sys.stderr.write( "\x1b[0;0H\x1b[2J" )

def displayResults( results ):
    """updates the list of results on the screen, best scores first"""
    results.sort( reverse=True )
    goHome()
    for score, url, exerpt in results:
        print ""
        print "%d %s" % (score, url)
        print " -- ", ' '.join( exerpt.split( "\n" ) )

  
def main( debug=False ):
    if len( sys.argv ) > 1:
        keyword = sys.argv[1]
    else:
        print "Usage: %s keyword" % sys.argv[0]
        sys.exit( 1 )

    if debug: print "starting main thread"
    mainThread = urlRetrieverThread( MAINURL, keyword )
    mainThread.start()
    mainThread.join()

    if debug: print "main thread done!"
    
    text = mainThread.getPage()
    if debug: print "list of docs = ",text

    listOfDocs = parse( text, debug )

    #--- create a queue for communication between the threads ---
    #--- and main (a maxsize of 0 means unlimited)             ---
    queue = Queue.Queue( 0 )
        
    #--- start all the threads ---
    listOfThreads = {}
    for id, doc in enumerate( listOfDocs ):
        url = doc[ "url" ]
        thread = urlRetrieverThread( url, keyword, queue, id )
        listOfThreads[id] = (doc, thread)
        thread.start()
        if debug: print "starting thread", id
        

    #--- wait for the threads to pass their Id back through the queue---
    results = []
    count = 0
    while True:
        #--- get the Id of the thread that is done ---
        id = queue.get( block=True, timeout=None )

        #--- get the info and the thread pointer associated with this id ---
        doc, thread = listOfThreads[id]
        score  = doc[ "score" ]
        url    = doc[ "url" ]
        if debug:
            print "dequeued", id, score, url

        #--- get the bit of text around the keyword from the thread ---
        exerpt = thread.getExerpt()
        if debug:
            print "exerpt = ", exerpt
            
        #--- add a triplet to the list ---
        results.append( [score, url, exerpt] )

        #--- update the display with results sorted by scores ---
        displayResults( results )
        
        #--- stop when we have received all the texts back ---
        count += 1
        if count >= len( listOfDocs ):
            break
        
    if debug:
        print "main done!"

        
if __name__ == "__main__":
    main( False )
</source>
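
The key pattern in this solution is the queue-based rendezvous: instead of join()ing the 20 threads in some fixed order, each thread puts its Id on a shared Queue when it finishes, and main() dequeues the Ids in completion order, refreshing the display as each result arrives. Below is a minimal, self-contained sketch of just that pattern (Python 2, like the solution above; the sleep stands in for the real page download):

<source lang="python">
#! /usr/bin/python
# completion-order rendezvous: each worker puts its Id on a
# shared Queue when done; main processes Ids as they arrive.
import threading
import Queue        # use "queue" lowercase in Python 3
import time
import random

class worker( threading.Thread ):
    def __init__( self, Id, queue ):
        threading.Thread.__init__( self )
        self.Id    = Id
        self.queue = queue

    def run( self ):
        #--- stand-in for the real work (downloading a page) ---
        time.sleep( random.randint( 1, 30 ) / 10.0 )
        self.queue.put( self.Id )

queue   = Queue.Queue( 0 )
threads = [ worker( i, queue ) for i in range( 5 ) ]
for t in threads:
    t.start()

#--- the Ids come back in completion order, not launch order ---
for count in range( len( threads ) ):
    print "worker %d done" % queue.get( block=True )
</source>

[[Category:CSC352]][[Category:Python]][[Category:Threads]][[Category:Homework]]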