APPENDIX B 97

for i in range(num_runs):

[a_prime, b_prime, c_prime, d_prime] = shuffle([a + b, c + d], [a + c, b + d])

prob_tail = fishers_exact_test(a_prime, b_prime, c_prime, d_prime)

if (prob_tail <= observed_prob_tail):

count = count + 1

######################################

#

# Output

#

######################################

print "Observed Fisher's Exact Test: %.4f" % observed_prob_tail

print count, "out of 10000 experiments had a Fisher's Exact Test less than or equal to

%.4f" % observed_prob_tail

print "Probability that chance alone gave us a Fisher's Exact Test",

print "of %.4f" % observed_prob_tail, "or less is", (count / float(num_runs))

ﬁshersexact.vals

>tea taster claimed milk first

3 1

>tea taster claimed tea first

2 4

OneWayAnovaSig.py

#!/usr/bin/python

######################################

# One-Way ANOVA Significance Test

# From: Statistics is Easy! By Dennis Shasha and Manda Wilson

#

#

# Assuming that there is no difference in the three drugs used, tests to see the

probability

# of getting a f-statistic by chance alone greater than or equal to the one observed.

# Uses shuffling to get a distribution to compare

# to the observed f-statistic.

#

# Author: Manda Wilson

#

# Example of FASTA formatted input file:

# >grp_a

# 45 44 34 33 45 46 34

# >grp_b

# 34 34 50 49 48 39 45

# >grp_c

# 24 34 23 25 36 28 33 29

#

# Pseudocode:

98 STATISTICS IS EASY!

#

# 1. Calculate f-statistic for the observed values (in our example it is 11.27).

# a. Initialize within sum of squares (wss), total sum (ts),

# total count (tc), and between sum of squares (bss) to 0

# b. For each group:

# i. Within group mean (wgm) = sum of group values / number of values in group

# ii. Store the number of values in group

# iii. Sum the following: for each value in the group

# I. Subtract the value from wgm

# II. Square the result of step (1biiiI)

# III. Add the result of step (1biiiII) to wss

# c. Total mean (tm) = ts / tc

# d. For each group:

# i. Subtract the wgm from tm

# ii. Square the result of step (1di)

# iii. Multiply the result of step (1dii) by

# the number of values in that group (stored in step (1bii))

# iv. Add the result of step (1diii) to bss

# e. Between degrees of freedom (bdf) = number of groups - 1

# f. Within degrees of freedome (wdf) = tc - number of groups

# g. Within group variance (wgv) = wss / wdf

# h. Between group variance (bgv) = bss / bdf

# i. f-statistic = wgv / bgv

#

# 2. Set a counter to 0, this will count the number of times we get a f-statistic

# greater than or equal to 11.27.

#

# 3. Do the following 10,000 times:

# a. Shuffle the observed values. To do this:

# i. Put the values from all the groups into one array

# ii. Shuffle the pooled values

# iii. Reassign the pooled values to groups of the same size as the original

groups

# b. Calculate f-statistic on the results from step (3a), just as we did in step (1)

# c. If the result from step (3b) is greater than or equal to our observed f-

statistic (11.27),

# increment our counter from step (2).

#

# 3. counter / 10,000 equals the probability of getting a f-statistic greater than or

equal to

# 11.27, assuming there is no difference between the groups

#

######################################

import random

######################################

#

# Adjustable variables

#

######################################

APPENDIX B 99

input_file = 'input/OneWayAnova.vals'

######################################

#

# Subroutines

#

######################################

# a list of lists

def shuffle(grps):

num_grps = len(grps)

pool = []

# throw all values together

for i in range(num_grps):

pool.extend(grps[i])

# mix them up

random.shuffle(pool)

# reassign to groups

new_grps = []

start_index = 0

end_index = 0

for i in range(num_grps):

end_index = start_index + len(grps[i])

new_grps.append(pool[start_index:end_index])

start_index = end_index

return new_grps

def sumofsq(vals, mean):

# the sum of squares for a group is calculated

# by getting the sum of the squared difference

# of each value and the mean of the group that value belongs to

count = len(vals)

total = 0

for i in range (count):

diff_sq = (vals[i] - mean)**2

total += diff_sq

return total

def weightedsumofsq(vals, weights, mean):

count = len(vals)

total = 0

for i in range (count):

diff_sq = (vals[i] - mean)**2

total += weights[i] * diff_sq

return total

# expects list of lists

def onewayanova(grps):

num_grps = len(grps)

within_group_means = []

Get *Statistics is Easy!* now with O’Reilly online learning.

O’Reilly members experience live online training, plus books, videos, and digital content from 200+ publishers.