#!/usr/bin/perl use strict; use warnings; #ECOL 553 Homework 8 #Solution, generated by Dan DeBlasio #14 November 2012 #take in file name my $fname = shift @ARGV; #open the file open FILE, $fname or die("$fname: $!"); my @sequences; #take in the file and put the sequences in the array; while(my $line = ){ chomp $line; if(substr($line,0,1) ne ">"){ push(@sequences, $line); } } #get the sequence lengths my $length = length($sequences[0]); #a binary (boolean) array if a column is variable # the size will be equal to the length my @sites; my $num_sites = 0; #holds the sum of the 2pq values; my $twopq = 0; #look at each column, find the number that are variable and sum the 2pq values #assume that each column only contains two types foreach my $i (0 ... $length-1){ my ($p,$q) = (0,0); my $consensus = substr($sequences[0],$i,1); foreach my $seq (@sequences){ my $char = substr($seq,$i,1); if($char eq $consensus){ $p++; }else{ $q++; } } #if column is variable, set @sites to 1 $sites[$i] = ($q>0); $num_sites += $sites[$i]; #make the p and q values frequencies $p /= scalar(@sequences); $q /= scalar(@sequences); #add this column to the 2pq sum $twopq += 2 * $p * $q; } #find the harmonic number for the number H_n-1 #then the generalized r-harmonic number H^2_n-1 # see http://en.wikipedia.org/wiki/Harmonic_number my $harmonic = 0.0; my $harmonic_2 = 0.0; foreach my $i (1 ... scalar(@sequences)-1){ $harmonic += 1/$i; $harmonic_2 += 1/($i*$i); } my $n = scalar(@sequences); #calculate pi and theta my $pi = (($n/($n-1)) * $twopq)/$length; my $theta = ($num_sites/$length) * (1/$harmonic); #calculate the variance of pi my $b1 = (($n+1)/(3*($n-1))); my $b2 = (2*(($n*$n) + $n + 3))/(9 * $n * ($n-1)); my $var_pi = (($b1 * $pi)/($length)) + ($b2 * $pi * $pi); #calculate the variance of theta my $var_theta = ($theta/($length * $harmonic)) + (($harmonic_2 * $theta * $theta)/($harmonic * $harmonic)); #print result print "pi: $pi\tvariance pi: $var_pi\n"; print "theta: $theta\tvariance theta: $var_theta\n"; #print the variable columns foreach my $seq (@sequences){ foreach my $i (0 ... $length-1){ if($sites[$i]){ print STDERR substr($seq,$i,1); } } print STDERR "\n"; }