APPENDIX B 97
for i in range(num_runs):
[a_prime, b_prime, c_prime, d_prime] = shuffle([a + b, c + d], [a + c, b + d])
prob_tail = fishers_exact_test(a_prime, b_prime, c_prime, d_prime)
if (prob_tail <= observed_prob_tail):
count = count + 1
######################################
#
# Output
#
######################################
print "Observed Fisher's Exact Test: %.4f" % observed_prob_tail
print count, "out of 10000 experiments had a Fisher's Exact Test less than or equal to
%.4f" % observed_prob_tail
print "Probability that chance alone gave us a Fisher's Exact Test",
print "of %.4f" % observed_prob_tail, "or less is", (count / float(num_runs))
ﬁshersexact.vals
>tea taster claimed milk first
3 1
>tea taster claimed tea first
2 4
OneWayAnovaSig.py
#!/usr/bin/python
######################################
# One-Way ANOVA Significance Test
# From: Statistics is Easy! By Dennis Shasha and Manda Wilson
#
#
# Assuming that there is no difference in the three drugs used, tests to see the
probability
# of getting a f-statistic by chance alone greater than or equal to the one observed.
# Uses shuffling to get a distribution to compare
# to the observed f-statistic.
#
# Author: Manda Wilson
#
# Example of FASTA formatted input file:
# >grp_a
# 45 44 34 33 45 46 34
# >grp_b
# 34 34 50 49 48 39 45
# >grp_c
# 24 34 23 25 36 28 33 29
#
# Pseudocode:
98 STATISTICS IS EASY!
#
# 1. Calculate f-statistic for the observed values (in our example it is 11.27).
# a. Initialize within sum of squares (wss), total sum (ts),
# total count (tc), and between sum of squares (bss) to 0
# b. For each group:
# i. Within group mean (wgm) = sum of group values / number of values in group
# ii. Store the number of values in group
# iii. Sum the following: for each value in the group
# I. Subtract the value from wgm
# II. Square the result of step (1biiiI)
# III. Add the result of step (1biiiII) to wss
# c. Total mean (tm) = ts / tc
# d. For each group:
# i. Subtract the wgm from tm
# ii. Square the result of step (1di)
# iii. Multiply the result of step (1dii) by
# the number of values in that group (stored in step (1bii))
# iv. Add the result of step (1diii) to bss
# e. Between degrees of freedom (bdf) = number of groups - 1
# f. Within degrees of freedome (wdf) = tc - number of groups
# g. Within group variance (wgv) = wss / wdf
# h. Between group variance (bgv) = bss / bdf
# i. f-statistic = wgv / bgv
#
# 2. Set a counter to 0, this will count the number of times we get a f-statistic
# greater than or equal to 11.27.
#
# 3. Do the following 10,000 times:
# a. Shuffle the observed values. To do this:
# i. Put the values from all the groups into one array
# ii. Shuffle the pooled values
# iii. Reassign the pooled values to groups of the same size as the original
groups
# b. Calculate f-statistic on the results from step (3a), just as we did in step (1)
# c. If the result from step (3b) is greater than or equal to our observed f-
statistic (11.27),
# increment our counter from step (2).
#
# 3. counter / 10,000 equals the probability of getting a f-statistic greater than or
equal to
# 11.27, assuming there is no difference between the groups
#
######################################
import random
######################################
#
#
######################################
APPENDIX B 99
input_file = 'input/OneWayAnova.vals'
######################################
#
# Subroutines
#
######################################
# a list of lists
def shuffle(grps):
num_grps = len(grps)
pool = []
# throw all values together
for i in range(num_grps):
pool.extend(grps[i])
# mix them up
random.shuffle(pool)
# reassign to groups
new_grps = []
start_index = 0
end_index = 0
for i in range(num_grps):
end_index = start_index + len(grps[i])
new_grps.append(pool[start_index:end_index])
start_index = end_index
return new_grps
def sumofsq(vals, mean):
# the sum of squares for a group is calculated
# by getting the sum of the squared difference
# of each value and the mean of the group that value belongs to
count = len(vals)
total = 0
for i in range (count):
diff_sq = (vals[i] - mean)**2
total += diff_sq
def weightedsumofsq(vals, weights, mean):
count = len(vals)
total = 0
for i in range (count):
diff_sq = (vals[i] - mean)**2
total += weights[i] * diff_sq