APPENDIX B 71

# count the number of times we got greater than or equal to 15 heads out of 17 coin

tosses

countgood = len(filter(lambda x: x >= observed_number_of_heads, out))

######################################

#

# Output

#

######################################

print countgood, "out of", number_of_bootstraps, "times we got at least",

print observed_number_of_heads, "heads in", number_of_tosses, "tosses."

print "Probability that chance alone gave us at least", observed_number_of_heads,

print "heads in", number_of_tosses, "tosses is", countgood /

float(number_of_bootstraps), "."

Diff2MeanSig.py

#!/usr/bin/python

######################################

# Difference between Two Means Significance Test

# From: Statistics is Easy! By Dennis Shasha and Manda Wilson

#

#

# Assuming that there is no significant difference in the means of

# the two samples, tests to see the probability of getting a difference

# greater than or equal to the observed difference in the means by chance

# alone. Uses shuffling & bootstrapping to get a distribution to compare

# to the observed statistic.

#

# Author: Manda Wilson

#

# Example of FASTA formatted input file (this is only for 2 groups):

# >placebo_vals

# 54 51 58 44 55 52 42 47 58 46

# >drug_vals

# 54 73 53 70 73 68 52 65 65

#

# Pseudocode:

#

# 1. Measure the difference between the two group means. The difference in means is

measured

# by (sum(grpA) / len(grpA)) - (sum(grpB) / len(grpB)). In this example the

difference between

# the two group means is 12.97.

#

# 2. Set a counter to 0, this will count the number of times we get a difference

# between the means greater than or equal to 12.97.

#

# 3. Do the following 10,000 times:

# a. Shuffle the original measurements. To do this:

72 STATISTICS IS EASY!

# i. put the values from all the groups into one array but remembering the start

and end indexes of each group

# ii. shuffle the values in the array, effectively reassigning the values to

different groups

# b. Measure the difference between the two group means, just as we did in step (1).

# c. If the difference from step (3b) is greater than or equal to 12.97, increment

our counter

# from step (2). Note: if our original difference between the means were a

negative value

# we would check for values less than or equal to that value.

#

# 4. counter / 10,000 equals the probability of getting our observed difference of two

means greater than

# or equal to 12.97, if there is in fact no significant difference.

#

######################################

import random

######################################

#

# Adjustable variables

#

######################################

input_file = "input/Diff2Mean.vals"

######################################

#

# Subroutines

#

######################################

# takes a list of groups (two or more)

# pools all values, shuffles them, and makes new groups

# of same size as original groups

# returns these new groups

# example of shuffle with more than two groups: http://www.statisticsiseasy.com/code/

OneWayAnovaSig.py

def shuffle(grps):

num_grps = len(grps)

pool = []

# pool all values

for i in range(num_grps):

pool.extend(grps[i])

# mix them up

random.shuffle(pool)

# reassign to groups of same size as original groups

new_grps = []

start_index = 0

end_index = 0

APPENDIX B 73

for i in range(num_grps):

end_index = start_index + len(grps[i])

new_grps.append(pool[start_index:end_index])

start_index = end_index

return new_grps

# subtracts group a mean from group b mean and returns result

def meandiff(grpA, grpB):

return sum(grpB) / float(len(grpB)) - sum(grpA) / float(len(grpA))

######################################

#

# Computations

#

######################################

# list of lists

samples = []

a = 0

b = 1

# file must be in FASTA format

infile=open(input_file)

for line in infile:

if line.startswith('>'):

# start of new sample

samples.append([])

elif not line.isspace():

# line must contain values for previous sample

samples[len(samples) - 1] += map(float,line.split())

infile.close()

observed_mean_diff = meandiff(samples[a], samples[b])

count = 0

num_shuffles = 10000

for i in range(num_shuffles):

new_samples = shuffle(samples)

mean_diff = meandiff(new_samples[a], new_samples[b])

# if the observed difference is negative, look for differences that are smaller

# if the observed difference is positive, look for differences that are greater

if observed_mean_diff < 0 and mean_diff <= observed_mean_diff:

count = count + 1

elif observed_mean_diff >= 0 and mean_diff >= observed_mean_diff:

count = count + 1

######################################

#

# Output

#

######################################

Get *Statistics is Easy!* now with O’Reilly online learning.

O’Reilly members experience live online training, plus books, videos, and digital content from 200+ publishers.