Difference between revisions of "CSC352 Homework 2 Solution 2"
Line 158: | Line 158: | ||
<br /> | <br /> | ||
<br /> | <br /> | ||
− | [[Category:CSC352]][[Category:Python]][[Category:Multiprocessing]] | + | [[Category:CSC352]][[Category:Python]][[Category:Multiprocessing]][[Category:Homework]] |
Latest revision as of 16:26, 14 April 2010
--D. Thiebaut 23:14, 22 February 2010 (UTC)
#! /usr/bin/python
# D. Thiebaut
# multiprocessRetrieveEtext.py
#
# *** Works only with Python 2.6 or newer! ***
#
# This program is the multiprocessing version of the program that fetches
# text from a repository indexed by swishe.
# This version probes the repository for 20 docs (or fewer) that contain
# the given keyword. The result is an xml list of 20 document urls.
# The program parses this list, extracts the 20 raw urls, and launches
# 20 processes to fetch the 20 files at the given urls.
# The processes then extract a short context of words around the keyword
# and pass that back to the main program through a shared queue.
#
# Example of use:
#
# python2.6 multiprocessRetrieveEtext.py lover
#
import sys
import urllib, urllib2
import textwrap
import threading
import Queue # use "queue" lowercase in Python 3
import time
import random
import multiprocessing
#---------------------------- GLOBALS ---------------------------------
# Address of the swish-e search page queried for matching documents.
MAINURL = "http://xgridmac.dyndns.org/~thiebaut/swish-e/swishe.php"

# User-Agent string sent with every request.  It is assembled with implicit
# string concatenation so that the value stays on a single line: an HTTP
# header value must not contain a raw newline.
VERSION = ( "Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11) "
            "Gecko/20071127 Firefox/2.0.0.11" )

# Extra headers passed to the urllib2.Request objects built below.
HEADERS = { "User-Agent" : VERSION }
def getListOfDocs( keyword, debug=False ):
    """Ask the main server for the documents that contain the given keyword.

    keyword: the word to search for; sent to MAINURL as the "search" field.
    debug:   when true, print the request arguments before sending them.

    Returns the raw body of the server's reply, exactly as read from the
    response.  Errors raised by urllib2 while opening or reading the URL are
    not caught here and propagate to the caller.
    """
    args = { "search" : keyword }
    if debug:
        # single-argument print( ... ) produces the same output whether it is
        # parsed as a Python 2 print statement or a Python 3 function call
        print( "args =  " + str( args ) )

    # passing a data argument makes urllib2 send the request as a POST
    req = urllib2.Request( MAINURL, urllib.urlencode( args ), HEADERS )
    f = urllib2.urlopen( req )
    try:
        return f.read()
    finally:
        # release the connection even if reading the body fails
        f.close()
def parse( text, debug=False ):
    """Extract the per-document records from the reply of the swishe server.

    The reply is scanned line by line for the markers "<br>url:",
    "<br>score:", "<br>file:" and "<br>offset:".  Whatever follows a marker
    on its line (stripped of surrounding white space) is the field's value;
    "score" and "offset" are converted to integers.

    A record is considered complete when its "<br>offset:" line is seen: the
    dictionary collected so far is then appended to the result and a new,
    empty one is started.  Fields that are never followed by an offset line
    are therefore not returned.

    text:  the reply of the server, as one string.
    debug: accepted for symmetry with the other functions; not used.

    Returns a list of dictionaries, one per document, of the form
      { "url": string, "score": integer, "file": string, "offset": integer }
    Raises ValueError if a score or an offset is not a valid integer.
    """
    docs = []
    doc = {}
    for line in text.split( "\n" ):
        for key in ( "url", "score", "file", "offset" ):
            tag = "<br>" + key + ":"
            if tag not in line:
                continue
            # split only once, and on the complete tag, so that a value which
            # itself contains e.g. "url:" is kept whole
            value = line.split( tag, 1 )[1].strip()
            if key in ( "score", "offset" ):
                value = int( value )
            doc[key] = value
            if key == "offset":
                # the offset line closes the current record
                docs.append( doc )
                doc = {}
    return docs
def getOneDoc( url, keyword, queue ):
    """Fetch one document and put a short excerpt of it on the queue.

    This is the function executed by the parallel processes.  It downloads
    the whole file found at the given url, takes a sample of the words
    surrounding the keyword, and stores that sample in the queue, which
    main() is blocking on.

    url:     address of the file to fetch.
    keyword: the word around which the excerpt is taken.
    queue:   object with a put() method that receives the excerpt (a string).

    Returns nothing.  Errors raised by urllib2 are not caught: in that case
    nothing is put on the queue.
    """
    #--- fetch the file at the given url ---
    req = urllib2.Request( url, urllib.urlencode( {} ), HEADERS )
    f = urllib2.urlopen( req )
    try:
        text = f.read()
    finally:
        # release the connection even if reading the body fails
        f.close()

    #--- hand the excerpt back to whoever reads the queue ---
    queue.put( _makeExcerpt( text, keyword ) )


def _makeExcerpt( text, keyword ):
    """Return a sample of text around keyword, wrapped at 40 columns."""
    #--- locate the keyword ---
    offset = text.find( keyword )
    if offset != -1:
        # up to 60 characters on each side; slicing clamps at the end of text
        excerpt = text[ max( offset-60, 0 ) : offset+60 ]
    else:
        # keyword not found: the whole text is used
        excerpt = text

    #--- remove duplicate white spaces ---
    # the first and last words are dropped, presumably because the slice
    # above can cut them in the middle
    excerpt = ' '.join( excerpt.split()[1:-1] )

    #--- wrap text at 40 ---
    return '\n'.join( textwrap.wrap( excerpt, 40 ) )
def getAllDocs( listOfDocs, keyword, debug=False ):
    """Fetch, one after the other, an excerpt of every document in the list.

    This is the serial counterpart of what main() does with processes: each
    url is handed to getOneDoc() in turn, and the excerpt it produces is read
    back from a local queue.

    listOfDocs: list of dictionaries as returned by parse(); only the "url"
                entry of each dictionary is used.
    keyword:    the word around which the excerpts are taken.
    debug:      when true, print each url and the excerpt fetched from it.

    Returns the list of excerpts, in the order of listOfDocs.
    """
    # getOneDoc() delivers its result through a queue rather than a return
    # value, so give it a plain in-process queue to put the excerpt on
    results = Queue.Queue()
    excerpts = []
    for doc in listOfDocs:
        url = doc[ "url" ]
        if debug:
            print( "-" * 40 )
            print( "fetching url: " + url )
        getOneDoc( url, keyword, results )
        text = results.get()        # already there: getOneDoc() has returned
        if debug:
            print( text )
        excerpts.append( text )
    return excerpts
# ------------------------------------------------------------------------------------
def main( keyword, debug=False ):
    """Main program: search for a keyword and print an excerpt of each match.

    The main server is asked for the documents containing the keyword, one
    process is launched per document url, and the excerpts the processes put
    on a shared queue are printed as they arrive.

    keyword: word to search for; replaced by the first command-line argument
             when one is given.
    debug:   when true, print the reply of the server and a line for each
             process launched.
    """
    if len( sys.argv ) > 1:
        keyword = sys.argv[1]

    #--- get all 20 docs as a list of dictionaries ---
    text = getListOfDocs( keyword, debug )
    if debug:
        print( "text =  " + text )
    listOfDocs = parse( text, False )

    #--- get ready to launch one process per url received ---
    processList = []
    results = multiprocessing.Queue()
    for i, doc in enumerate( listOfDocs ):
        p = multiprocessing.Process( target=getOneDoc,
                                     args=( doc[ "url" ], keyword, results ) )
        p.start()
        processList.append( p )
        if debug:
            print( "Launched Process # %d" % ( i+1 ) )

    #--- wait for information back, until all processes have been heard ---
    # (testing the count first means that an empty list of docs does not
    #  wait 30 seconds on a queue that nobody will ever feed)
    count = 0
    while count < len( listOfDocs ):
        #--- block wait on the queue for something back ---
        try:
            text = results.get( True, 30 )      # wait at most 30 sec.
        except Queue.Empty:
            break                               # and if timeout, break out!

        #--- get the text and print it ---
        print( "-" * 40 )
        print( text )
        count += 1

    #--- clean up the processes ---
    # A process that is still running after a short grace period is stuck
    # (its excerpt never came): terminate it, otherwise the interpreter
    # would wait for it when the program exits.
    for p in processList:
        p.join( 1 )
        if p.is_alive():
            p.terminate()
            p.join()

    print( "Done!" )
# Run the search only when this file is executed as a script.  The guard
# keeps the processes from being launched again if the file is imported,
# which multiprocessing itself does on platforms where child processes
# re-import the main module.
if __name__ == "__main__":
    main( "love", False )                   # search for "love"...