CSC352 Homework 2 Solution 2

From dftwiki3
Revision as of 08:19, 24 February 2010 by Thiebaut (talk | contribs)
Jump to: navigation, search

--D. Thiebaut 23:14, 22 February 2010 (UTC)


#! /usr/bin/python
# D. Thiebaut
# multiprocessRetrieveEtext.py
#
# *** Works only with Python 2.6 or 2.7 (not with Python 3)! ***
#
# This program is the multiprocessing version of the program that fetches
# text from a repository indexed by swishe.
# This version probes the repository for 20 docs (or fewer) that contain 
# the given keyword.  The result is an xml list of 20 document urls.
# The program parses this list, extracts the 20 raw urls, and launches
# 20 processes to fetch the 20 files at the given urls.
# The processes then extract a short context of words around the keyword
# and pass that back to the main program through a shared queue.
#
# Example of use:
# 
#   python2.6 multiprocessRetrieveEtext.py lover
#
import sys
import urllib, urllib2
import textwrap
import threading
import Queue        # use "queue" lowercase in Python 3
import time
import random
import multiprocessing

#---------------------------- GLOBALS ---------------------------------
# Address of the swishe search script that returns the list of documents.
MAINURL = "http://xgridmac.dyndns.org/~thiebaut/swish-e/swishe.php"
# Browser identification sent with every request.  Built from adjacent string
# literals so that the value is one single line: a header value must not
# contain a raw newline.
VERSION = ( "Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11) "
            "Gecko/20071127 Firefox/2.0.0.11" )
# Headers passed to every urllib2.Request built by this program.
HEADERS       = {"User-Agent"  : VERSION }




def getListOfDocs( keyword, debug=False ):
    """probes the main server for 20 urls of files containing the given
    keyword"""
    url = MAINURL
    args = { "search" : keyword }
    if debug: print "args = ", str( args )
    req = urllib2.Request( url,  urllib.urlencode( args ), HEADERS )
    f = urllib2.urlopen( req )
    return f.read()

def _valueAfter( line, marker ):
    """Return the stripped text following marker in line, or None if the
    marker does not appear in the line."""
    before, found, after = line.partition( marker )
    if not found:
        return None
    return after.strip()

def parse( text, debug=False ):
    """Extract the per-file records from the text returned by the swishe server.

    The text is scanned line by line for the markers "<br>url:", "<br>score:",
    "<br>file:" and "<br>offset:".  Everything that follows a marker on its
    line, stripped of surrounding white space, becomes the value of the
    corresponding field; "score" and "offset" are converted to integers.
    A line holding "<br>offset:" closes the current record.

    text:  string returned by the server
    debug: accepted for symmetry with the other functions; not used here

    Returns a list of dictionaries, one per file, of the form
    { "url": string, "score": integer score, "file": name of file, "offset": integer }
    Raises ValueError if a score or an offset is not a valid integer.
    """
    docs = []
    doc  = {}
    for line in text.split( "\n" ):
        # The value is taken after the complete marker, so a url or a file
        # name that itself contains "url:", "file:", ... is kept whole.
        url = _valueAfter( line, "<br>url:" )
        if url is not None:
            doc["url"] = url

        score = _valueAfter( line, "<br>score:" )
        if score is not None:
            doc["score"] = int( score )

        fileName = _valueAfter( line, "<br>file:" )
        if fileName is not None:
            doc["file"] = fileName

        offset = _valueAfter( line, "<br>offset:" )
        if offset is not None:
            doc["offset"] = int( offset )
            # the offset is the last field of a record: store it, start anew
            docs.append( doc )
            doc = {}

    return docs

def getOneDoc( url, keyword, queue ):
    """Worker function executed by each of the parallel processes.

    Fetches the whole file at the given url, cuts out the text running from
    60 characters before the first occurrence of keyword to 60 characters
    after the start of that occurrence (or keeps the whole text if keyword
    is not found), cleans it up, and puts the resulting string in queue,
    on which main() is blocking.

    url:     address of the file to fetch
    keyword: word to locate in the file (the search is case sensitive)
    queue:   object whose put() method receives the excerpt

    Returns nothing; the result is passed through the queue.  Errors raised
    while fetching the file are not caught, in which case nothing is put in
    the queue.
    """

    #--- fetch the file at the given url ---
    req = urllib2.Request( url,  urllib.urlencode( { } ), HEADERS )
    f = urllib2.urlopen( req )
    try:
        text = f.read()
    finally:
        # release the connection even if read() fails
        f.close()

    #--- locate the keyword ---
    offset = text.find( keyword )
    if offset != -1:
        # slicing past the end of a string is safe, only the start needs clamping
        excerpt = text[ max( offset-60, 0 ) : offset+60 ]
    else:
        excerpt = text

    #--- collapse runs of white space, and drop the first and last words,
    #--- which the slice above may have cut in the middle
    excerpt = ' '.join( excerpt.split()[1:-1] )

    #--- wrap text at 40
    excerpt = '\n'.join( textwrap.wrap( excerpt, 40 ) )
    queue.put( excerpt )

def getAllDocs( listOfDocs, keyword, debug=False ):
    """given the keyword, ask the main server for 20 documents that contain
    the keyword"""
    for doc in listOfDocs:
        url = doc[ "url" ]
        fileName = doc[ "file" ]
        if debug:
            print "-" * 40
            print "fetching url:", url
        text = getOneDoc( url, keyword, debug )
        if debug: print text        
        
# ------------------------------------------------------------------------------------
def main( keyword, debug=False ):
    """Main program"""
    if len( sys.argv ) > 1:
       keyword = sys.argv[1]
 
    #--- get all 20 docs as a list of dictionaries ---
    text = getListOfDocs( keyword, debug )
    if debug: print "text = ", text
    listOfDocs = parse( text, False )

    #--- get ready to launche one process per url received ---
    processList = []
    queue = multiprocessing.Queue()

    for i, doc in enumerate( listOfDocs ):
        url = doc[ "url" ]
        p = multiprocessing.Process( target = getOneDoc, args =(url, keyword, queue) )
        p.start()
        processList.append( p )
        if debug: print "Launched Process #", i+1

    #--- wait for information back ---
    count = 0
    while True:
        #--- block wait on the queue for something back ---
        try:
            text = queue.get( True, 30 ) # wait at most 30 sec.
        except:
            break              # and if timeout, break out!

        #--- get the text and print it ---
        print "-" * 40
        print text

        #--- if we have heard from all processes, then stop ---
        count += 1
        if count >= len( listOfDocs ):
            break
    
    print "Done!"

# Run the search only when this file is executed as a script.  The
# multiprocessing module may import the main module again in the child
# processes; without this guard each child would start the search anew.
if __name__ == "__main__":
    main( "love", False ) # search for "love"...