CSC334 genAminoHisto.pl

From dftwiki3
Jump to: navigation, search

--D. Thiebaut 02:24, 4 November 2008 (UTC)


#! /usr/bin/perl
# genAminoHisto.pl
# D. Thiebaut
# Syntax:
#
# ./genAminoHisto.pl N
# 
# where N is an integer.
# 
# Generates N random amino acids, and computes their histogram
# Typical use:
#
#    ./genAminoHisto.pl 1000
#
# The example above generates 1,000 amino acids, sorts them, and generates
# a histogram that is printed out
#
use strict;
use warnings;

#--- check syntax ---
# With no argument, print the usage text and stop.  The exit status stays
# 0, as it always has been, so existing wrappers are unaffected.
if ( @ARGV < 1 ) {
    print "Syntax genAminoHisto.pl N\n";
    print "where N is the number of samples wanted\n\n";
    exit( 0 );
}

my @aminos;       # amino-acid names, in the order they appear in the table
my %histogram;    # amino-acid name => number of times it was drawn

#--- put first words of lines past __END__ in array ---
# Each table line is "NAME CODON CODON ..."; only NAME is kept.  Every name
# is entered in the histogram with a count of 0 so that amino acids which
# are never drawn still show up in the output.
while ( my $line = <DATA> ) {
    chomp( $line );
    my ( $first ) = split " ", $line;
    next unless defined $first;    # blank line: nothing to record
    push( @aminos, $first );
    $histogram{$first} = 0;
}

# rand(0) silently behaves like rand(1), so an empty table would make the
# sampling loop count an undefined name; fail loudly instead.
die "No amino acid table found after __END__\n" unless @aminos;

#--- generate random sample ---
my $N        = $ARGV[0];
my $aminoDim = @aminos;

#--- generate N random indexes in [0, Dim), where Dim is the # of amino  ---
#--- acids, and count the amino acid found at each random index          ---
# A C-style loop is kept on purpose: a range such as (1 .. $N) would fall
# back to string auto-increment if $N were not numeric.
for ( my $i = 0; $i < $N; $i++ ) {
    my $amino = $aminos[ int( rand( $aminoDim ) ) ];
    $histogram{$amino}++;
}

#--- print histogram ---
# Keys are sorted so the report is stable from run to run rather than
# following Perl's randomised hash order.
for my $amino ( sort keys %histogram ) {
    print "$amino: $histogram{$amino}\n";
}

__END__
ALA GCU GCC GCA GCG
ARG CGU CGC CGA CGG AGA AGG
ASN AAU AAC
ASP GAU GAC
CYS UGU UGC
GLN CAA CAG
GLU GAA GAG
GLY GGU GGC GGA GGG
HIS CAU CAC
ILE AUU AUC AUA
LEU UUA UUG CUU CUC CUA CUG
LYS AAA AAG
MET AUG
PHE UUU UUC
PRO CCU CCC CCA CCG
SER UCU UCC UCA UCG AGU AGC
THR ACU ACC ACA ACG
TRP UGG
TYR UAU UAC
VAL GUU GUC GUA GUG