CSC352 Project1 Solution

From dftwiki3
Revision as of 09:33, 11 March 2010 by Thiebaut (talk | contribs)
Jump to: navigation, search

--D. Thiebaut 14:24, 11 March 2010 (UTC)


The solutions I picked were submitted by Yang and by Diana.

Analysis

Two reports: pdf1 and pdf2 .

Source Files

Multiprocessing

#! /usr/bin/python
"""
proj1MultiProc.py

CSC352 Spring 2010
Yang Li

This is a multiprocessing program that retrieves 20 files
containing the search term and analyzes the word frequency
of each retrieved file.
It scores each file by the frequency rank of the search term
among the other words in that file. The search results are listed in order of decreasing score.

To run:
    chmod +x proj1MultiProc.py
    ./proj1MultiProc.py <keyword>
"""
import urllib, urllib2
import multiprocessing
import textwrap,re,sys

# Url of the search page; main() sends the keyword to it as the "search" field.
MAINURL = "http://xgridmac.dyndns.org/~thiebaut/swish-e/swishe.php"
# Browser-like User-Agent string.  It is assembled from adjacent literals so
# that the value stays on a single line: a triple-quoted literal wrapped over
# two source lines would embed a newline in the HTTP header value.
VERSION = ("Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11) "
           "Gecko/20071127 Firefox/2.0.0.11")
# Headers passed to every urllib2.Request built by this script.
HEADERS       = {"User-Agent"  : VERSION }

# Stop words (alphabetical order): words found in this list are ignored
# when word frequencies are counted.
STOPWORDS = """
    a able about across after all almost also am among an and any are as at
    be because been but by can cannot could dear did do does either else ever
    every for from get got had has have he her hers him his how however i if
    in into is it its just least let like likely may me might most must my
    neither no nor not of off often on only or other our own rather said say
    says she should since so some than that the their them then there these
    they this tis to too twas us wants was we were what when where which
    while who whom why will with would yet you your
""".split()



def contentSearch(f_url,keywd,queue):
    """
    Fetch file from url, find context of keyword and store to queue
    f_url:      file url
    keywd:      search keyword (matched case-sensitively with str.find)
    queue:      queue object that stores output; a [score, report] list
                is put on it only when the file contains the keyword

    Returns None. Nothing is queued when the url cannot be opened or when
    the keyword does not occur in the file.
    """
    f_req = urllib2.Request(f_url,None,HEADERS)
    try:
        f = urllib2.urlopen( f_req )
    except urllib2.URLError:
        # best effort: a file that cannot be opened is left out of the results
        return
    try:
        text = f.read()
    finally:
        # release the connection even when the download fails half-way
        f.close()

    #find keyword
    i = text.find(keywd)
    if i == -1:
        return

    #find context: up to 50 characters on each side of the first match.
    #Slicing clamps the upper bound, so a match near the end of the text
    #keeps its final character (an end index of -1 would drop it).
    start = max(0, i-50)
    context = text[start:i+50]
    #the first and last tokens are dropped because the window edges may
    #cut a word in half
    words = context.split()[1:-1]
    output = "fetching url: "+f_url+"\n"
    output += "\n".join(textwrap.wrap(" ".join(words),40))

    #find score
    relevance = score(text,keywd)
    queue.put([relevance,output])
    
def score(text,keyword):
    """
    compute word frequency and document score
    text:       string of text to be analyzed
    keyword:    search keyword (compared case-sensitively)

    The text is split on runs of non-word characters; empty tokens and
    words listed in STOPWORDS are ignored. The remaining distinct words
    are ranked by decreasing (frequency, word) and the score is
    1.0/rank of the keyword, rank 1 being the most frequent word.

    Returns the score as a float, or -1 when the keyword is not among
    the counted words.
    """
    stopwords = frozenset(STOPWORDS) #constant-time membership tests
    freqDict = {} #dictionary of word-frequency pairs
    for word in re.split(r'\W+',text):
        #re.split yields '' when the text starts or ends with punctuation;
        #that is not a word, so it must not take a place in the ranking
        if not word or word in stopwords:
            continue
        freqDict[word] = freqDict.get(word,0)+1

    #sort once, after all pairs are collected (sorting inside the
    #collecting loop re-sorted the list for every distinct word)
    ranking = sorted(((freq,word) for word,freq in freqDict.items()),
                     reverse=True)
    for i,(freq,word) in enumerate(ranking):
        if word == keyword:
            return 1.0/(1+i)
    return -1 #keyword not counted
    
""" Main class """
def main():
    """
    Search for the keyword given on the command line, fetch every url
    listed in the search result in its own process and print the scored
    excerpts in decreasing score order.

    Returns 0 when done. Exits with status 1 when no keyword is given.
    """
    # multiprocessing.Queue signals a timed-out get() with the Empty
    # exception of the standard Queue module
    from Queue import Empty

    if len( sys.argv ) > 1:
        keyword = sys.argv[1]
    else:
        print("Usage: ./proj1MultiProc.py <keyword>")
        sys.exit(1)

    args = { "search" : keyword }
    req = urllib2.Request( MAINURL,  urllib.urlencode( args ), HEADERS )
    response = urllib2.urlopen( req )
    try:
        lines = response.readlines()
    finally:
        response.close()

    # The queue is unbounded: with a size limit, a worker blocks in put()
    # as soon as the search returns more urls than the limit.
    queue = multiprocessing.Queue()
    # read search result and create a process for each url
    workers = []
    for line in lines:
        if line.find( "<br>url:" ) != -1:
            f_url =  line.split( "url:" )[1].strip()
            p = multiprocessing.Process(target=contentSearch,args=(f_url,keyword,queue))
            p.start()
            workers.append(p)

    # Collect output while the workers run. A process that has put data on
    # a multiprocessing queue does not terminate until that data has been
    # flushed, so joining before reading the queue can deadlock.
    results = []
    running = workers
    while running:
        try:
            results.append(queue.get(True, 0.1))
        except Empty:
            pass
        running = [p for p in running if p.is_alive()]
    for p in workers:
        p.join()
    # all workers have exited: pick up whatever they left behind
    while True:
        try:
            results.append(queue.get(True, 0.1))
        except Empty:
            break

    # print output, best score first
    results.sort(reverse=True)
    for i,(relevance,report) in enumerate(results):
        print("\n---------------------------")
        print("Rank: %d   Score: %f"% (i+1,relevance))
        print(report)

    # Every url that produced no output is reported. The count is taken
    # from the number of workers actually started, and is also correct
    # when there are no results at all.
    # NOTE(review): a file that opens but does not contain the keyword
    # also produces no output and is counted here.
    missing = len(workers)-len(results)
    if missing > 0:
        print("\n===========================")
        print(str(missing)+" url(s) can not be opened.\n")

    print("\nDone")

    return 0

# Run the search only when the file is executed as a script, and hand
# main()'s return code to the shell as the exit status.
if __name__=="__main__":
    sys.exit(main())

Threading

#! /usr/bin/python
"""
proj1Thread.py

CSC352 Spring 2010
Yang Li

This is a multi-threaded program that retrieves 20 files
containing the search term and analyzes the word frequency
of each retrieved file.
It scores each file by the frequency rank of the search term
among the other words in that file. The search results are listed in order of decreasing score.

To run:
    chmod +x proj1Thread.py
    ./proj1Thread.py <keyword>
"""
import urllib, urllib2
import multiprocessing
import textwrap
import sys,re
from threading import Thread
from Queue import Queue       # use "queue" lowercase in Python 3

# Url of the search page; main() sends the keyword to it as the "search" field.
MAINURL = "http://xgridmac.dyndns.org/~thiebaut/swish-e/swishe.php"
# Browser-like User-Agent string.  It is assembled from adjacent literals so
# that the value stays on a single line: a triple-quoted literal wrapped over
# two source lines would embed a newline in the HTTP header value.
VERSION = ("Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11) "
           "Gecko/20071127 Firefox/2.0.0.11")
# Headers passed to every urllib2.Request built by this script.
HEADERS       = {"User-Agent"  : VERSION }

# Stop words (alphabetical order): words found in this list are ignored
# when word frequencies are counted.
STOPWORDS = """
    a able about across after all almost also am among an and any are as at
    be because been but by can cannot could dear did do does either else ever
    every for from get got had has have he her hers him his how however i if
    in into is it its just least let like likely may me might most must my
    neither no nor not of off often on only or other our own rather said say
    says she should since so some than that the their them then there these
    they this tis to too twas us wants was we were what when where which
    while who whom why will with would yet you your
""".split()

"""extended thread class"""
class urlRetrieverThread( Thread ):
    """
    Thread that fetches one file, looks for the search keyword in it and
    puts a [score, report] list on the shared queue when the keyword is
    found.
    """

    def __init__(self,f_url,keyword,queue):
        """
        f_url:      file url
        keyword:    search keyword
        queue:      queue object that stores output
        """
        Thread.__init__(self)
        self.f_url = f_url      #file url
        self.keyword = keyword  #search keyword
        self.queue = queue      #queue that stores output

    def run(self):
        """
        Fetch file from url, find context of keyword and compute scores

        Nothing is queued when the url cannot be opened or when the
        keyword does not occur in the file.
        """
        f_req = urllib2.Request(self.f_url,None,HEADERS)
        try:
            f = urllib2.urlopen( f_req )
        except urllib2.URLError:
            # best effort: a file that cannot be opened is left out of
            # the results
            return
        try:
            text = f.read()
        finally:
            # release the connection even when the download fails half-way
            f.close()

        #find keyword (case-sensitive)
        i = text.find(self.keyword)
        if i == -1:
            return

        #find context: up to 50 characters on each side of the first match.
        #Slicing clamps the upper bound, so a match near the end of the
        #text keeps its final character (an end index of -1 would drop it).
        start = max(0, i-50)
        context = text[start:i+50]
        #the first and last tokens are dropped because the window edges
        #may cut a word in half
        words = context.split()[1:-1]
        output = "fetching url: "+self.f_url+"\n"
        output += "\n".join(textwrap.wrap(" ".join(words),40))

        #find score
        relevance = self.score(text)
        self.queue.put([relevance,output])

    def score(self,text):
        """
        compute word frequency and document score
        text:   string of text to be analyzed

        The text is split on runs of non-word characters; empty tokens
        and words listed in STOPWORDS are ignored. The remaining distinct
        words are ranked by decreasing (frequency, word) and the score is
        1.0/rank of self.keyword, rank 1 being the most frequent word.

        Returns the score as a float, or -1 when the keyword is not among
        the counted words.
        """
        stopwords = frozenset(STOPWORDS) #constant-time membership tests
        freqDict = {} #dictionary of word-frequency pairs
        for word in re.split(r'\W+',text):
            #re.split yields '' when the text starts or ends with
            #punctuation; that is not a word, so it must not take a place
            #in the ranking
            if not word or word in stopwords:
                continue
            freqDict[word] = freqDict.get(word,0)+1

        #sort once, after all pairs are collected (sorting inside the
        #collecting loop re-sorted the list for every distinct word)
        ranking = sorted(((freq,word) for word,freq in freqDict.items()),
                         reverse=True)
        for i,(freq,word) in enumerate(ranking):
            if word == self.keyword:
                return 1.0/(1+i)
        return -1 #keyword not counted


    
""" Main class """
def main():
    """
    Search for the keyword given on the command line, fetch every url
    listed in the search result in its own thread and print the scored
    excerpts in decreasing score order.

    Returns 0 when done. Exits with status 1 when no keyword is given.
    """
    if len( sys.argv ) > 1:
        keyword = sys.argv[1]
        print("Search for keyword: %s ...\n" % keyword)
    else:
        print("Usage: ./proj1Thread.py <keyword>")
        sys.exit(1)

    args = { "search" : keyword }
    req = urllib2.Request( MAINURL,  urllib.urlencode( args ), HEADERS )
    response = urllib2.urlopen( req )
    try:
        lines = response.readlines()
    finally:
        response.close()

    # The queue is unbounded: with a size limit, a thread blocks in put()
    # once the queue is full, and the join() below then never returns
    # because the queue is only read after all threads have finished.
    queue = Queue()
    # read search result and create a thread for each url
    threads = []
    for line in lines:
        if line.find( "<br>url:" ) != -1:
            f_url =  line.split( "url:" )[1].strip()
            t = urlRetrieverThread(f_url,keyword,queue)
            t.start()
            threads.append(t)

    # wait for all threads to finish
    for t in threads:
        t.join()

    # no thread is running any more, so empty() can be trusted here
    results = []
    while not queue.empty():
        results.append(queue.get())

    # print output, best score first
    results.sort(reverse=True)
    for i,(relevance,report) in enumerate(results):
        print("\n---------------------------")
        print("Rank: %d   Score: %f"% (i+1,relevance))
        print(report)

    # Every url that produced no output is reported. The count is taken
    # from the number of threads actually started, and is also correct
    # when there are no results at all.
    # NOTE(review): a file that opens but does not contain the keyword
    # also produces no output and is counted here.
    missing = len(threads)-len(results)
    if missing > 0:
        print("\n===========================")
        print(str(missing)+" url(s) can not be opened.\n")

    print("\nDone")

    return 0


# Run the search only when the file is executed as a script, and hand
# main()'s return code to the shell as the exit status.
if __name__=="__main__":
    sys.exit(main())