# --D. Thiebaut 19:11, 24 January 2010 (UTC)
#! /usr/bin/python
# D. Thiebaut
# feed key words, get list of docs, then download docs
# and process them
import sys
import urllib, urllib2
import textwrap
import threading
import Queue # use "queue" lowercase in Python 3
import time
import random
#--- User-Agent sent with every request.  It is kept on a single line:
#--- a raw newline inside an HTTP header value is not a valid header and
#--- may be rejected by the HTTP library when the request is sent.
#--- NOTE(review): the version numbers after "rv:" and "Firefox/" look
#--- truncated -- confirm against the intended browser string.
VERSION = ( "Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv: "
            "Gecko/20071127 Firefox/" )

#--- extra HTTP headers for the requests ---
HEADERS = { "User-Agent" : VERSION }
class urlRetrieverThread( threading.Thread ):
    """Thread that sends an optional keyword to a url and keeps the reply.

    The page is fetched by run(), i.e. once the thread is started.  When a
    queue is supplied, the thread puts its Id on that queue when run()
    ends, so the creator can tell which thread has finished.
    """

    def __init__( self, url, keyword, queue=None, Id=None ):
        """Store the request parameters; nothing is fetched here.

        url     -- the url of the text/page to be retrieved
        keyword -- the keyword to search for, or None to send no argument
        queue   -- optional queue on which Id is put when run() ends
        Id      -- value reported on the queue
        """
        threading.Thread.__init__( self )
        self.Id      = Id
        self.queue   = queue
        self.url     = url       # the url of the text/page to be retrieved
        self.text    = None      # the page retrieved (None until fetched)
        self.keyword = keyword   # the keyword to search for, if any
        self.args    = {}        # form arguments sent with the request
        if keyword is not None:
            self.args = { "search" : keyword }

    def getPage( self ):
        """returns the page retrieved by this thread, or None if nothing
        was retrieved.  Make sure isAlive() returns false first, to make
        sure the thread has finished before calling this."""
        return self.text

    def getExerpt( self ):
        """returns the 1st portion of the page retrieved by this thread
        surrounding the current keyword, wrapped at 40 chars.  Returns ""
        when there is no keyword, no page, or the keyword is not in the
        page.  Make sure isAlive() returns false first, to make sure the
        thread has finished before calling this."""
        if self.keyword is None or self.text is None:
            return ""
        offset = self.text.find( self.keyword )
        if offset == -1:
            return ""

        #--- up to 60 chars on each side of the keyword's first char;
        #--- slicing clamps the upper bound to the end of the text ---
        exerpt = self.text[ max( offset-60, 0 ) : offset+60 ]

        #--- remove duplicate white spaces, drop the first and last word
        #--- of the window, and wrap at 40 chars ---
        exerpt = ' '.join( exerpt.split()[1:-1] )
        return '\n'.join( textwrap.wrap( exerpt, 40 ) )

    def run( self ):
        """Fetch self.url, sending self.args as form data and the
        module-level HEADERS, and store the reply in self.text.

        The queue (if any) receives self.Id even when the request raises,
        so a consumer blocked on the queue is always released; in that
        case self.text keeps its previous value (None)."""
        try:
            req = urllib2.Request( self.url,
                                   urllib.urlencode( self.args ),
                                   HEADERS )
            f = urllib2.urlopen( req )
            try:
                self.text = f.read()
            finally:
                f.close()

            #--- an additional delay can be introduced here to see how
            #--- result information from all the threads evolves with
            #--- time, e.g. time.sleep( random.randint( 1, 50 ) / 10 )
        finally:
            #--- tell main program we're done ---
            if self.queue is not None:
                self.queue.put( self.Id )
def parse( text, debug=False ):
    """Extract the document records listed in a result page.

    The page is scanned line by line for the tags "<br>url:",
    "<br>score:", "<br>file:" and "<br>offset:".  The text following a
    tag, stripped of surrounding white space, becomes the value of the
    matching key; score and offset are converted to int.  A record is
    closed and added to the result when its offset line is seen.

    text  -- the page to scan; None or "" yields an empty list
    debug -- accepted for symmetry with the other functions; not used

    Returns a list of dictionaries, in page order.
    Raises ValueError when a score or offset is not an integer.
    """
    #--- (key, converter) for each tagged line; offset must stay last
    #--- because it closes the current record ---
    fields = ( ( "url", None ), ( "score", int ),
               ( "file", None ), ( "offset", int ) )

    docs = []
    if not text:
        return docs

    doc = {}
    for line in text.split( "\n" ):
        for key, convert in fields:
            marker = "<br>" + key + ":"
            if marker not in line:
                continue
            #--- split only once, on the full tag, so that a value that
            #--- itself contains "url:" or "file:" is kept whole ---
            value = line.split( marker, 1 )[1].strip()
            if convert is not None:
                value = convert( value )
            doc[key] = value
            if key == "offset":
                docs.append( doc )
                doc = {}
    return docs
def getAllDocs( listOfDocs, keywords, debug=False ):
    """Fetch, one after the other, every document of listOfDocs.

    listOfDocs -- list of dictionaries holding at least the keys "url"
                  and "file"
    keywords   -- passed unchanged to getOneDoc()
    debug      -- when true, print the url before each fetch and the
                  value returned by getOneDoc() after it

    getOneDoc() is not defined in this function; it must be available at
    module level.  Nothing is returned.
    """
    for entry in listOfDocs:
        address  = entry[ "url" ]
        fileName = entry[ "file" ]    # looked up, but not used further
        if debug:
            print( "--------------------------------------------" )
            print( "%s %s" % ( "fetching url:", address ) )
        page = getOneDoc( address, keywords, debug )
        if debug:
            print( page )
def goHome():
    """Write two ANSI escape sequences to stderr: cursor position 0;0
    followed by erase display (mode 2)."""
    cursorToOrigin = "\x1b[0;0H"
    eraseDisplay   = "\x1b[2J"
    sys.stderr.write( cursorToOrigin + eraseDisplay )
def displayResults( results ):
    """Print every result, largest first.

    results -- list of [score, url, exerpt] entries.  The list is sorted
               in place and left in descending order.

    For each entry an empty line is printed, then the score and url,
    then the exerpt with its newlines replaced by spaces.
    """
    #--- ascending sort, then flip: the caller's list ends up descending ---
    results.sort()
    results.reverse()

    for score, url, exerpt in results:
        print( "" )
        print( "%d %s" % ( score, url ) )
        print( "%s %s" % ( " -- ", ' '.join( exerpt.split( "\n" ) ) ) )
def main( keyword, debug=False ):
    """Search for keyword, then fetch all the matching documents in
    parallel and display the results as they arrive.

    keyword -- the keyword sent to MAINURL and looked for in each document
    debug   -- when true, print progress information

    A first urlRetrieverThread fetches the list of documents from
    MAINURL (a module-level name that must be defined elsewhere in the
    file).  One thread per document is then started; each reports its
    id on a shared queue when done, and the whole list of results
    gathered so far is displayed again after every arrival.
    """
    #--- get the list of documents; the thread must be started and
    #--- waited for, otherwise getPage() has nothing to return ---
    if debug: print( "starting main thread" )
    mainThread = urlRetrieverThread( MAINURL, keyword )
    mainThread.start()
    mainThread.join()
    if debug: print( "main thread done!" )

    text = mainThread.getPage()
    if debug: print( "%s %s" % ( "list of docs = ", text ) )
    #--- getPage() returns None when nothing was retrieved ---
    listOfDocs = parse( text or "", debug )

    #--- create a queue for communication between threads ---
    #--- and main ---
    queue = Queue.Queue( 0 )

    #--- start all the threads ---
    listOfThreads = {}
    for threadId, doc in enumerate( listOfDocs ):
        thread = urlRetrieverThread( doc[ "url" ], keyword, queue, threadId )
        listOfThreads[threadId] = ( doc, thread )
        if debug: print( "%s %s" % ( "starting thread", threadId ) )
        thread.start()

    #--- collect exactly one notification per thread started, so the
    #--- loop ends (and never blocks when there are no documents) ---
    results = []
    for count in range( len( listOfDocs ) ):
        threadId = queue.get( block=True, timeout=None )
        doc, thread = listOfThreads[threadId]
        score = doc[ "score" ]
        url   = doc[ "url" ]
        if debug:
            print( "%s %s %s %s" % ( "dequeued", threadId, score, url ) )
        exerpt = thread.getExerpt()
        if debug:
            print( "%s %s" % ( "exerpt = ", exerpt ) )
        results.append( [score, url, exerpt] )
        displayResults( results )

    if debug:
        print( "main done!" )
#--- run a sample search for the keyword "love", debugging output off ---
main( "love", debug=False )