ThreadedSearchKeywordRetrieveEtexts.py


--D. Thiebaut 19:11, 24 January 2010 (UTC)

#! /usr/bin/python
# D. Thiebaut
# threadedRetrieveEtext.py
#
# feed key words, get list of docs, then download docs
# and process them
import sys
import urllib, urllib2
import textwrap
import threading
import Queue        # use "queue" lowercase in Python 3
import time
import random
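
#--- MAINURL points to a swish-e search script; for every matching
#--- document it returns "url:", "score:", "file:", and "offset:"
#--- fields prefixed with "<br>" (see parse() below) ---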

MAINURL = "http://xgridmac.dyndns.org/~thiebaut/swish-e/swishe.php"
VERSION = """Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11)
 Gecko/20071127 Firefox/2.0.0.11"""
HEADERS       = {"User-Agent"  : VERSION }



class urlRetrieverThread( threading.Thread ):
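    """thread that fetches one url (POSTing a search keyword as form
    data, if one is given) and signals completion by putting its Id
    on the shared queue."""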
    
    def __init__( self, url, keyword, queue=None, Id=None ):
        threading.Thread.__init__( self )
        self.Id        = Id
        self.queue     = queue
        self.url       = url      # the url of the text/page to be retrieved
        self.text      = None     # the page retrieved
        self.keyword   = keyword  # the keyword to search for, if any
        self.args      = {}
        if keyword is not None:
            self.args = { "search" : keyword }
            
    def getPage( self ):
        """returns the page retrieved by this thread.  Call this only
        after the thread has finished, i.e. after join() returns or
        once isAlive() is False."""
        return self.text

    def getExcerpt( self ):
        """returns a short excerpt of the retrieved page surrounding
        the first occurrence of the keyword.  Call this only after the
        thread has finished, i.e. after join() returns or once
        isAlive() is False."""
        if self.keyword is None or self.text is None:
            return ""

        offset = self.text.find( self.keyword )
        if offset == -1:
            return ""
        excerpt = self.text[ max( offset-60, 0 )
                           : min( offset+60, len( self.text ) ) ]

        #--- drop the (likely partial) first and last words, collapse ---
        #--- duplicate white spaces, and wrap at 40 chars             ---
        excerpt = ' '.join( excerpt.split()[1:-1] )
        excerpt = '\n'.join( textwrap.wrap( excerpt, 40 ) )
        return excerpt

    def run( self ):
        """fetches the page at self.url, POSTing the search keyword
        (if any) as form data, and stores it in self.text."""
        try:
            req = urllib2.Request( self.url,
                                   urllib.urlencode( self.args ),
                                   HEADERS )
            f = urllib2.urlopen( req )
            self.text = f.read()

            #--- introduce additional delay to see how result information
            #--- from all the threads evolves with time.
            #time.sleep( random.randint( 1, 50 ) / 10.0 )

        finally:
            #--- tell main program we're done, even if the fetch failed, ---
            #--- so that main never blocks forever on the queue          ---
            if self.queue is not None:
                self.queue.put( self.Id )
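
# example of using the class by itself, synchronously:
#     t = urlRetrieverThread( MAINURL, "love" )
#     t.start()
#     t.join()
#     print t.getPage()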
            
def parse( text, debug=False ):
    """parses the output of the swish-e search script and returns a
    list of dictionaries, one per matching document, with the keys
    "url", "score", "file", and "offset"."""
    lines = text.split( "\n" )
    docs = []
    doc  = {}
    for line in lines:
        if line.find( "<br>url:" ) != -1:
            doc["url"] = line.split( "url:" )[1].strip()
        if line.find( "<br>score:" ) != -1:
            doc["score"] = int( line.split( "score:" )[1].strip() )
        if line.find( "<br>file:" ) != -1:
            doc["file"] = line.split( "file:" )[1].strip()
        if line.find( "<br>offset:" ) != -1:
            doc["offset"] = int( line.split( "offset:" )[1].strip() )
            #--- "offset" is the last field of a record: save the doc ---
            #--- and start a new one                                  ---
            docs.append( doc )
            doc = {}

    return docs
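
# example of input parse() accepts: a response containing the lines
#     <br>url: http://host/etext.txt
#     <br>score: 870
#     <br>file: etext.txt
#     <br>offset: 0
# yields [ {"url": "http://host/etext.txt", "score": 870,
#           "file": "etext.txt", "offset": 0} ]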


def getAllDocs( listOfDocs, keyword, debug=False ):
    """sequential (non-threaded) alternative: fetches the documents
    one after the other.  Not called by main(); kept for comparison
    with the threaded version."""
    for doc in listOfDocs:
        url = doc[ "url" ]
        if debug:
            print "--------------------------------------------"
            print "fetching url:", url
        req  = urllib2.Request( url, urllib.urlencode( {"search": keyword} ), HEADERS )
        text = urllib2.urlopen( req ).read()
        if debug: print text


def goHome():
    """moves the cursor to the top-left corner of the terminal and
    clears the screen, using ANSI escape sequences."""
    sys.stderr.write( "\x1b[0;0H\x1b[2J" )

def displayResults( results ):
    """clears the screen and displays all the results received so far,
    highest scores first."""
    results.sort( reverse=True )
    goHome()
    for score, url, excerpt in results:
        print ""
        print "%d %s" % (score, url)
        print " -- ", ' '.join( excerpt.split( "\n" ) )

  
def main( keyword, debug=False ):
    """queries the search engine for keyword, then downloads all the
    matching documents in parallel, displaying the sorted results as
    they come in."""
    #--- first get the list of matching documents, in its own thread ---
    if debug: print "starting search thread"
    searchThread = urlRetrieverThread( MAINURL, keyword )
    searchThread.start()
    searchThread.join()
    if debug: print "search thread done!"

    text = searchThread.getPage()
    if debug: print "list of docs = ", text

    listOfDocs = parse( text, debug )

    #--- create a queue for communication between threads ---
    #--- and main                                          ---
    queue = Queue.Queue( 0 )

    #--- start one download thread per document ---
    listOfThreads = {}
    for threadId, doc in enumerate( listOfDocs ):
        url = doc[ "url" ]
        thread = urlRetrieverThread( url, keyword, queue, threadId )
        listOfThreads[threadId] = (doc, thread)
        thread.start()
        if debug: print "starting thread", threadId

    #--- collect the results as the threads finish, redisplaying the ---
    #--- whole (sorted) list after each arrival                      ---
    count   = 0
    results = []
    while count < len( listOfDocs ):
        threadId = queue.get( block=True, timeout=None )
        doc, thread = listOfThreads[threadId]
        score = doc[ "score" ]
        url   = doc[ "url" ]
        if debug:
            print "dequeued", threadId, score, url
        excerpt = thread.getExcerpt()
        if debug:
            print "excerpt = ", excerpt

        results.append( [score, url, excerpt] )
        displayResults( results )
        count += 1

    if debug:
        print "main done!"

        
if __name__ == "__main__":
    main( "love", False )
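
The script above is written for Python 2 (urllib2, Queue, print statements). Below is a minimal sketch of the same pattern, worker threads reporting completion through a shared queue, adapted to Python 3. The swishe.php endpoint and its "search" POST parameter are taken from the script above; the other names (Fetcher, fetchAll) are illustrative only, and error handling is kept to the minimum needed to avoid a hung queue.

#!/usr/bin/env python3
# minimal Python 3 sketch of the same thread + queue pattern
import queue
import threading
import urllib.parse
import urllib.request

MAINURL = "http://xgridmac.dyndns.org/~thiebaut/swish-e/swishe.php"

class Fetcher( threading.Thread ):
    """fetches one url and reports its id on the shared queue."""
    def __init__( self, url, keyword, doneQueue, threadId ):
        threading.Thread.__init__( self )
        self.url       = url
        self.doneQueue = doneQueue
        self.threadId  = threadId
        self.data      = urllib.parse.urlencode( {"search": keyword} ).encode( "ascii" )
        self.text      = None

    def run( self ):
        try:
            with urllib.request.urlopen( self.url, self.data ) as f:
                self.text = f.read().decode( "utf-8", "replace" )
        finally:
            self.doneQueue.put( self.threadId )   # always signal completion

def fetchAll( urls, keyword ):
    done    = queue.Queue()
    threads = { i: Fetcher( url, keyword, done, i )
                for i, url in enumerate( urls ) }
    for t in threads.values():
        t.start()
    for _ in urls:                  # one queue entry per thread
        i = done.get()
        print( i, len( threads[i].text or "" ) )

if __name__ == "__main__":
    fetchAll( [ MAINURL ], "love" )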