CSC111 Lab 12 2018

From dftwiki3
Jump to: navigation, search

D. Thiebaut (talk) 08:28, 24 April 2018 (EDT)



...

<showafterdate after="20180428 12:00" before="20180601 00:00">

Better Fuzzy-Match Function


A better fuzzy-match function is the one below, found on StackOverflow, which implements the Levenshtein (edit) distance algorithm presented on this Wikipedia page.

def levenshteinDistance(s1, s2):
    """Return how similar s1 and s2 are, as a percentage from 0 to 100.

    Despite its name, this function does not return the raw edit
    distance.  It computes the Levenshtein distance d (the minimum
    number of single-character insertions, deletions and substitutions
    needed to turn one string into the other) and converts it to
    (maxLen - d) / maxLen * 100, where maxLen is the length of the
    longer string.  Identical strings score 100.0; strings with nothing
    in common score 0.0.  Two empty strings are treated as identical.
    """
    # Make s1 the shorter string so that each row of the table is short.
    if len(s1) > len(s2):
        s1, s2 = s2, s1

    # s2 is now the longer string: if it is empty, both are empty.
    # Returning here also avoids a division by zero below.
    if not s2:
        return 100.0

    # distances[k] = edit distance between s1[:k] and the prefix of s2
    # processed so far (initially the empty prefix).
    distances = list(range(len(s1) + 1))
    for i2, c2 in enumerate(s2):
        row = [i2 + 1]
        for i1, c1 in enumerate(s1):
            if c1 == c2:
                # same character: no extra edit needed
                row.append(distances[i1])
            else:
                # cheapest of substitution, and the two insert/delete moves
                row.append(1 + min(distances[i1], distances[i1 + 1], row[-1]))
        distances = row

    maxLen = len(s2)
    return (maxLen - distances[-1]) / maxLen * 100


Solution Programs


# lab12_1sol.py
# D. Thiebaut
# Solution program for Lab 12, 2018.
# Takes a string hard-coded between """ and """ quotes,
# and computes the frequency of occurrence of each letter
# of the alphabet (a-z) in its first 200 characters.
# Other characters are not counted.
# It displays the string composed of the 10 most frequent
# letters sorted by frequency, most frequent first.
# It then outputs, for each of several languages, the
# degree of matching between this string of 10 most
# frequent letters and the corresponding string known
# for that language.
# 
# Example of output:
# 
# Most frequent characters: eialrondsz
# Spanish        : 79.00% matching
# German         : 79.00% matching
# French         : 70.00% matching
# Italian        : 86.00% matching
# Dutch          : 77.00% matching
# Turkish        : 82.00% matching
# Polish         : 78.00% matching
# Swedish        : 71.00% matching

#import textwrap

# Candidate input files.  Each assignment overwrites the previous one,
# so only the last value ("italianShort.txt") is in effect; reorder
# the lines to select a different file.
FILENAME = "italianLong.txt"
FILENAME = "polishShortStory.txt"
FILENAME = "italianShort.txt"

# Built-in sample text used by main() as the text to analyze:
# a list of book titles and authors written in Italian.
# NOTE(review): "Cos  (se vi pare)" looks like a title whose accented
# characters were lost -- confirm against the original text.
SAMPLE = """Iliade, di Homerus (Omero) 
Sulla origine delle specie per elezione naturale,
ovvero conservazione delle razze perfezionate nella lotta per l'esistenza, di Charles Darwin 
Divina Commedia, di Dante Alighieri 
I promessi sposi, di Alessandro Manzoni 
Aforismi, novelle e profezie, di Leonardo 
Internet 2004, di M. Calvo, F. Ciotti, G. Roncaglia, M. A. Zela 
Odissea, di Homerus (Omero) 
Anna Karenina, di Lev Tolstoj 
Cos  (se vi pare), di Luigi Pirandello 
Sogno di una notte di mezza estate, di William Shakespeare 
Il fu Mattia Pascal, di Luigi Pirandello 
Trattato della Pittura, di Leonardo 
Romeo e Giulietta, di William Shakespeare 
Corano, vedi Corano 
La Sacra Bibbia, vedi Bibbia 
Giulio Cesare, di William Shakespeare 
Jolanda, la figlia del corsaro nero, di Emilio Salgari 
La coscienza di Zeno, di Italo Svevo 
Don Chisciotte della Mancia, di Miguel de Cervantes 
Novelle per un anno, di Luigi Pirandello
"""

def fuzzyMatch( string1, string2 ):
    '''Score how closely the character positions of string1
    agree with those of string2.  Meant for two strings of
    identical length that are close but not exactly equal.

    Every character of string1 is searched for in string2
    (first occurrence).  The gap between its position in
    string1 and the position found in string2 is a penalty;
    a character absent from string2 is treated as if found
    at position len( string1 ).  The value returned is 100
    minus the total of the penalties, so 100 means that every
    character was found at the same position.'''

    notFound = len( string1 )   # stand-in position for missing characters
    penalty = 0
    for position, letter in enumerate( string1 ):
        where = string2.find( letter )
        if where < 0:
            where = notFound
        penalty += abs( position - where )
    return 100 - penalty

    
def main():
    '''Guess the language of the sample text.

    Counts the letters a-z found in the first 200 characters of
    SAMPLE, prints the 10 most frequent ones (most frequent first),
    then prints, for each known language, the score returned by
    fuzzyMatch() between that language's 10 most frequent letters
    and the letters found in the sample.'''

    # To analyze a file instead of the built-in sample, use:
    # text = open( FILENAME, 'r' ).read()
    text = SAMPLE

    # count the occurrences of each letter; only the first 200
    # characters of the text are examined
    letterFreq = {}
    for char in text[0:200].lower():
        if char in 'abcdefghijklmnopqrstuvwxyz':
            # the first occurrence counts as 1, not 0
            letterFreq[ char ] = letterFreq.get( char, 0 ) + 1

    # list of pairs (frequency, letter), most frequent first;
    # letters with equal frequency end up in reverse alphabetical order
    freqList = sorted( ( ( count, char ) for char, count in letterFreq.items() ),
                       reverse=True )

    # keep only the letters (we strip the frequency) of the 10 most
    # frequent pairs
    mysteryFreqString = "".join( char for count, char in freqList[0:10] )
    print( "Most frequent characters:", mysteryFreqString )

    # figure out the language
    # '\u0131' is the Turkish dotless i.  It was previously written as
    # its two UTF-8 bytes, which Python 3 reads as two unrelated
    # characters.
    languageStrings = [('Spanish', 'eaosrnidlc'), ('German', 'enisratdhu'),
                       ('French', 'esaitnrulo'),  ('Italian', 'eaionlrtsc'),
                       ('Dutch', 'enatirodsl'), ('Turkish', 'aeinrl\u0131dk'),
                       ('Polish', 'iaeoznscrw'), ('Swedish', 'eantrslido')]

    for language, languageString in languageStrings:
        percent = fuzzyMatch( languageString, mysteryFreqString )
        print( "{0:15}: {1:1.2f}% matching".format( language, percent ) )
               
# Start the program.  The call is not guarded, so it runs whenever
# this file is executed or imported.
main()


</showafterdate>