Difference between revisions of "CSC352 Homework 2 Solution 1"
(Created page with '--~~~~ ---- <onlydft> <source lang="python"> #! /usr/bin/python # D. Thiebaut # threadedRetrieveEtext.py # # feed key words, get list of docs, then download docs # and process t…') |
|||
(4 intermediate revisions by the same user not shown) | |||
Line 2: | Line 2: | ||
---- | ---- | ||
− | |||
<source lang="python"> | <source lang="python"> | ||
#! /usr/bin/python | #! /usr/bin/python | ||
Line 8: | Line 7: | ||
# threadedRetrieveEtext.py | # threadedRetrieveEtext.py | ||
# | # | ||
− | # | + | # This program gets a keyword from the command line, and |
− | # and | + | # launches a thread that accesses a server and prompts it for |
+ | # a list of 20 Urls of files containing that keyword. | ||
+ | # The program then launches 20 threads on the 20 different threads | ||
+ | # and parallelizes the wait of the 20 threads. | ||
+ | # When the 20 threads are done, the main program gets the | ||
+ | # excerpts from each file with a few words before and after the | ||
+ | # keyword, and prints the results on the screen. | ||
+ | |||
import sys | import sys | ||
import urllib, urllib2 | import urllib, urllib2 | ||
Line 194: | Line 200: | ||
</source> | </source> | ||
− | </ | + | |
+ | <br /> | ||
+ | <br /> | ||
+ | <br /> | ||
+ | <br /> | ||
+ | <br /> | ||
+ | <br /> | ||
+ | [[Category:CSC352]][[Category:Python]][[Category:Threads]][[Category:Homework]] |
Latest revision as of 15:26, 14 April 2010
--D. Thiebaut 22:38, 22 February 2010 (UTC)
#! /usr/bin/python
# D. Thiebaut
# threadedRetrieveEtext.py
#
# This program gets a keyword from the command line, and
# launches a thread that accesses a server and prompts it for
# a list of 20 Urls of files containing that keyword.
# The program then launches 20 threads, one for each of the 20 Urls,
# and parallelizes the wait of the 20 threads.
# When the 20 threads are done, the main program gets the
# excerpts from each file with a few words before and after the
# keyword, and prints the results on the screen.
import sys
import urllib, urllib2
import textwrap
import threading
import Queue # use "queue" lowercase in Python 3
import time
import random
# URL of the search script to which main() sends the keyword.
MAINURL = "http://xgridmac.dyndns.org/~thiebaut/swish-e/swishe.php"
# Browser identification string used as the User-Agent value below.
# NOTE(review): the literal spans two lines, so the value contains an
# embedded newline -- confirm that this is intended for an HTTP header.
VERSION = """Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11)
Gecko/20071127 Firefox/2.0.0.11"""
# Headers passed to urllib2.Request by urlRetrieverThread.run().
HEADERS = {"User-Agent" : VERSION }
class urlRetrieverThread( threading.Thread ):
    """Thread that retrieves one page and can extract a keyword excerpt.

    The page at `url` is requested when the thread runs.  If a keyword
    is given, it is sent with the request as the form field "search".
    When a queue is supplied, the thread puts its Id on that queue once
    it is done, whether or not the retrieval succeeded.
    """

    def __init__( self, url, keyword, queue=None, Id=None ):
        """url:     address of the text/page to retrieve.
        keyword: the keyword to search for, or None.
        queue:   optional queue on which Id is put when the thread is done.
        Id:      value identifying this thread to the reader of the queue."""
        threading.Thread.__init__( self )
        self.Id      = Id
        self.queue   = queue
        self.url     = url        # the url of the text/page to be retrieved
        self.text    = None       # the page retrieved (None until retrieved)
        self.keyword = keyword    # the keyword to search for, if any
        self.args    = {}
        if keyword is not None:
            self.args = { "search" : keyword }

    def getPage( self ):
        """returns the page retrieved by this thread, or None if nothing
        was retrieved.  Make sure isAlive() returns false first, to make
        sure the thread has finished before calling this."""
        return self.text

    def getExerpt( self ):
        """returns the 1st portion of the page retrieved by this thread
        surrounding the current keyword, with white space collapsed and
        wrapped at 40 chars.  Returns "" if there is no keyword, no page,
        or the keyword is not in the page.  Make sure isAlive() returns
        false first, to make sure the thread has finished before calling
        this."""
        if self.keyword is None or self.text is None:
            return ""

        offset = self.text.find( self.keyword )
        if offset == -1:
            return ""

        #--- window of up to 60 chars on each side of the keyword start ---
        start = max( offset-60, 0 )
        end   = min( offset+60, len( self.text ) )
        words = self.text[ start:end ].split()

        #--- a side that was cut inside the text probably starts/ends in
        #--- the middle of a word: drop that word.  A side that reaches
        #--- the edge of the text is kept whole, so that a keyword at the
        #--- very beginning or end of the page is not thrown away.
        if start > 0:
            words = words[1:]
        if end < len( self.text ):
            words = words[:-1]

        #--- remove duplicate white spaces and wrap at 40 chars---
        exerpt = ' '.join( words )
        return '\n'.join( textwrap.wrap( exerpt, 40 ) )

    def run( self ):
        """retrieves the page and stores it in self.text, then puts Id
        on the queue (if one was given).  If the retrieval raises, the
        exception is not swallowed and self.text is left unchanged, but
        Id is still put on the queue so that a reader blocked on it is
        not left waiting forever."""
        try:
            req = urllib2.Request( self.url,
                                   urllib.urlencode( self.args ),
                                   HEADERS )
            f = urllib2.urlopen( req )
            try:
                self.text = f.read()
            finally:
                f.close()

            #--- Just to see what is happening, this code section
            #--- will add some additional delay to see how result information
            #--- from all the threads evolved with time.
            #--- Remove this line for normal operation!
            #--- (divide by 10.0: an integer division would truncate the
            #--- delay to whole seconds in Python 2)
            time.sleep( random.randint( 1, 50 ) / 10.0 )
        finally:
            #--- tell main program we're done ---
            if self.queue is not None:
                self.queue.put( self.Id )
def parse( text, debug=False ):
    """parses the text returned by the search and returns a list of
    dictionaries, one per document.

    The lines of interest contain one of the markers "<br>url:",
    "<br>score:", "<br>file:" or "<br>offset:" followed by the value.
    The score and offset are converted to ints (a ValueError is raised
    if they are not numbers).  A document is added to the list when its
    "<br>offset:" line is seen, so fields seen after the last offset
    line are dropped.

    text:  the text to parse; None or "" yields an empty list.
    debug: accepted for compatibility with callers, not used.
    """
    if not text:
        return []

    docs = []
    doc  = {}
    for line in text.split( "\n" ):
        #--- split only once, and on the whole marker, so that a value
        #--- that itself contains "url:" or "file:" is not truncated ---
        if "<br>url:" in line:
            doc["url"] = line.split( "<br>url:", 1 )[1].strip()
        if "<br>score:" in line:
            doc["score"] = int( line.split( "<br>score:", 1 )[1].strip() )
        if "<br>file:" in line:
            doc["file"] = line.split( "<br>file:", 1 )[1].strip()
        if "<br>offset:" in line:
            doc["offset"] = int( line.split( "<br>offset:", 1 )[1].strip() )
            #--- the offset line closes the description of one document ---
            docs.append( doc )
            doc = {}
    return docs
def getAllDocs( listOfDocs, keywords, debug=False ):
    """fetches, one after the other, every document of listOfDocs by
    passing its url to getOneDoc().  Nothing is returned; the text of
    each document is only printed when debug is true.

    listOfDocs: list of dictionaries, each with a "url" key.
    keywords:   passed unchanged to getOneDoc().
    debug:      if true, print the url and the text of each document.

    NOTE(review): getOneDoc() is not defined in the part of the file
    visible here -- confirm that it exists before calling this function.
    """
    for doc in listOfDocs:
        url = doc[ "url" ]
        if debug:
            print( "--------------------------------------------" )
            print( "fetching url: %s" % ( url, ) )
        text = getOneDoc( url, keywords, debug )
        if debug:
            print( text )
def goHome():
    """send the escape sequences that bring the cursor to the top left
    position and clear the screen.  The sequences are written to
    stderr."""
    out = sys.stderr
    out.write( "\x1b[0;0H\x1b[2J" )
def displayResults( results ):
    """update the list of results on the screen.

    results: list of [score, url, exerpt] triplets.  The list is sorted
    in place, in decreasing order, so the highest scores come first.
    The screen is reset with goHome() and each triplet is then printed
    as an empty line, a "score url" line, and the exerpt joined on a
    single line.
    """
    results.sort( reverse=True )
    goHome()
    for score, url, exerpt in results:
        print( "" )
        print( "%d %s" % ( score, url ) )
        #--- the exerpt is stored wrapped: put it back on one line ---
        print( " --  %s" % ' '.join( exerpt.split( "\n" ) ) )
def main( debug=False ):
    """gets the keyword from the command line, retrieves the list of
    documents containing it from MAINURL, starts one thread per
    document, and updates the display every time a thread reports that
    it is done.

    debug: if true, print a trace of what is happening.

    Returns without doing anything else if no keyword is given on the
    command line, or if the list of documents could not be retrieved.
    """
    #--- the keyword is required ---
    if len( sys.argv ) <= 1:
        prog = sys.argv[0] if sys.argv else "threadedRetrieveEtext.py"
        sys.stderr.write( "Usage: %s keyword\n" % prog )
        return
    keyword = sys.argv[1]

    #--- get the list of documents containing the keyword ---
    if debug: print( "starting main thread" )
    mainThread = urlRetrieverThread( MAINURL, keyword )
    mainThread.start()
    mainThread.join()
    if debug: print( "main thread done!" )

    text = mainThread.getPage()
    if debug: print( "list of docs =  %s" % ( text, ) )
    if text is None:
        #--- the thread did not retrieve anything: nothing to parse ---
        sys.stderr.write( "could not retrieve the list of documents from %s\n"
                          % MAINURL )
        return
    listOfDocs = parse( text, debug )

    #--- create a queue for communication between threads ---
    #--- and main ---
    queue = Queue.Queue( 0 )

    #--- start all the threads ---
    threads = {}
    for threadId, doc in enumerate( listOfDocs ):
        thread = urlRetrieverThread( doc[ "url" ], keyword, queue, threadId )
        threads[threadId] = ( doc, thread )
        thread.start()
        if debug: print( "starting thread %s" % ( threadId, ) )

    #--- wait for the threads to pass their Id back through the queue---
    #--- (exactly one Id is expected per thread started, so the loop ---
    #--- does not wait at all when there is no document)             ---
    results = []
    for _ in range( len( listOfDocs ) ):
        #--- get the Id of the thread that is done ---
        doneId = queue.get( block=True, timeout=None )

        #--- get the info and the thread associated with this id ---
        doc, thread = threads[doneId]
        score = doc[ "score" ]
        url   = doc[ "url" ]
        if debug:
            print( "dequeued %s %s %s" % ( doneId, score, url ) )

        #--- get the bit of text around the keyword from the thread ---
        exerpt = thread.getExerpt()
        if debug:
            print( "exerpt =  %s" % ( exerpt, ) )

        #--- add a triplet to the list ---
        results.append( [score, url, exerpt] )

        #--- update the display with results sorted by scores ---
        displayResults( results )

    if debug:
        print( "main done!" )
#--- run the program with the debugging output turned off ---
main( False )