-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathget_ferm_data.py
229 lines (199 loc) · 9.64 KB
/
get_ferm_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
"""
#####################################################################################################
This module reads fermentation data from the reference fermentation data Excel file
Inputs (hard coded):
- location of reference fermentation data excel file
- value for minimum fraction of substrate consumed (good default value is 0.2)
- list of types of media to be considered "MTC"
Outputs:
- dataframe with ethanol yield data (mean, stdev, CV and count) for each strain
Dan Olson
12-12-2016
#####################################################################################################
"""
#----------------------------------------------------------------------------------------------------
# imports
#----------------------------------------------------------------------------------------------------
import os
import re
from scipy.stats import ttest_ind_from_stats
import numpy as np
import pandas as pd
#----------------------------------------------------------------------------------------------------
# constants and global variables
#----------------------------------------------------------------------------------------------------
# UNC path of the folder that holds the reference fermentation spreadsheet
path = r'\\jumbo.thayer.dartmouth.edu\lyndlab\biomass\C thermocellum reference fermentation data'
# file name starts with a backslash so that plain concatenation with `path` gives the full path
fileName = r'\Compiled data through bottle 855 v4.xlsx'
# default spreadsheet location used by setup()
filePath = path + fileName
# setup() keeps only bottles whose fraction of substrate consumed is strictly greater than this
minSubstrateFraction = 0.2 # filter out bottles that have substrate utilization lower than this
# list of 'approved' types of MTC medium (both C. therm and T. sacch)
# setup() keeps a bottle only if its 'Media' value exactly matches one of these strings
# note that I might want to get rid of the 20 g/l bottles at some point
# NOTE(review): the '?' in the '6?g/ml' entries looks like a mis-encoded micro sign; the strings
# are left verbatim because they presumably have to match the spreadsheet text -- confirm
mtcMediumList = ['MTC',
'MTC + 0.5 g/l YE',
'MTC + 6ug/ml thiamphenicol',
'MTC 5 + 0.9g/L yeast extract',
'MTC with less N2',
'MTC+ RPMI vitamins and MEM amino acids',
'MTC-5',
'MTC-5 + 20g/L Cellobiose',
'MTC-5 + 20g/L Cellobiose, 6?g/ml Thiamphenicol',
'MTC-5 + 20g/L Cellobiose, 6?g/ml Thiamphenicol, H2 purged with H2 headspace, stationary incubation in chamber',
'MTC-5 + 20g/L Cellobiose, 6?g/ml Thiamphenicol, N2 purged, chamber depressurized, stationary incubation in chamber',
'MTC-5 + 20g/L Cellobiose, 6?g/ml Thiamphenicol, triple purged with chamber gas, stationary incubation in chamber',
'MTC-5 + 20g/L Cellobiose, 6?g/ml Thiamphenicol, triple purged with chamber gas, stationary incubation in chamber, left vented in chamber',
'MTC-5 + 6µg/ml thiamphenicol',
'MTC-5 + 6ug/ml Thiamphenicol',
'MTC-5, new cellobiose',
'MTC-6',
'MTC-6 (+4mM formate)',
'MTC-6 low cellobiose',
'MTC-6, 2.5 g/l cb',
'MTC-6, 2.5 g/l cellobiose',
'MTC-6, new cellobiose',
'MTC-6 + 0.5g/L YE',
'T. sacch MTC']
# module-level holders filled in by setup():
cleanFermData = None # filtered fermentation data (the dataframe setup() returns)
rawFermData = None # unfiltered bottle data plus the boolean filter columns, for manual inspection
#----------------------------------------------------------------------------------------------------
# functions
#----------------------------------------------------------------------------------------------------
# Strain IDs are an 'AG' or 'LL' prefix followed by 3-4 digits (e.g. 'LL1004', 'AG123').
# Compiled once at import time; the raw string keeps '\d' from being read as a string escape.
_STRAIN_ID_PATTERN = re.compile(r'(?:AG|LL)\d{3,4}')


def getStrainID(inStr):
    """
    Extract the first strain ID found in a string.

    A strain ID starts with 'LL' or 'AG' and is followed by 3-4 digits.  The
    prefix is kept in the returned value.  When more than four digits follow
    the prefix, only the first four are included (e.g. 'LL12345' -> 'LL1234').

    Parameters:
        inStr: value to search; anything that is not a string (NaN, None,
               numbers, ...) is treated as having no strain ID

    Returns:
        the first matching strain ID, or '' when there is none
    """
    # spreadsheet cells may be NaN or numeric, so guard before running the regex
    if not isinstance(inStr, str):
        return ''
    match = _STRAIN_ID_PATTERN.search(inStr)
    return match.group(0) if match else ''
def setup(fp=filePath):
    """
    Read the fermentation spreadsheet, filter the bottles and return the clean data.

    Steps:
        1. read the 'bottles' sheet (skipping the first 19 rows), indexed by 'Bottle ID'
        2. add 'substrateFracUsed' (cellobiose consumed / initial cellobiose) and
           'StrainID' (extracted from 'LL number' with getStrainID; the LL/AG
           prefix is kept)
        3. drop, in this order: blank rows, rejected bottles, bottles without an
           ethanol yield ('Yet'), bottles without a strain ID, bottles that used
           too little substrate (<= minSubstrateFraction), bottles in a medium
           that is not in mtcMediumList
        4. print a summary of how many bottles each step removed

    Parameters:
        fp: path of the Excel file to read (defaults to the module-level filePath)

    Returns:
        dataframe with the surviving bottles, columns 'carbon balance' through 'StrainID'

    Side effects (module-level globals that are set):
        totalBottles        - number of rows read
        nonRejected, hasEtohYield, hasStrainId, enoughSubstrateUsed, isMtcMedium
                            - boolean filter series, one value per row read
        rawFermData         - all rows read, with the five filters above added as columns
        cleanFermData       - the returned dataframe
    """
    global totalBottles
    global nonRejected
    global hasEtohYield
    global hasStrainId
    global enoughSubstrateUsed
    global isMtcMedium
    global rawFermData
    global cleanFermData

    # read data from excel file and set up new columns
    # ('sheet_name' is the keyword current pandas accepts; the older 'sheetname' was removed)
    df = pd.read_excel(fp, sheet_name='bottles', skiprows=19)
    df.set_index('Bottle ID', inplace=True)
    df['substrateFracUsed'] = df['cellobiose consumed'] / df['initial substrate (cellobiose)']
    df['StrainID'] = df['LL number'].apply(getStrainID)  # '' when no strain ID is found
    totalBottles = len(df)

    # generate boolean filters for getting rid of rows
    blankRows = (df['Media'].isnull()
                 & df['Blank ID'].isnull()
                 & df['Bottle name'].isnull()
                 & df['Strain description'].isnull())
    nonRejected = df['Reject from analysis'].isnull()
    hasEtohYield = df['Yet'].notnull()
    hasStrainId = df['StrainID'] != ''
    enoughSubstrateUsed = df['substrateFracUsed'] > minSubstrateFraction
    isMtcMedium = df['Media'].isin(mtcMediumList)

    # store the raw bottle results in a dataframe for manual inspection, if needed
    # (rawFermData is the same object as df; the flag columns land after 'StrainID',
    #  so they are not part of the 'carbon balance':'StrainID' slice taken below)
    rawFermData = df
    rawFermData['nonRejected'] = nonRejected
    rawFermData['hasEtohYield'] = hasEtohYield
    rawFermData['hasStrainId'] = hasStrainId
    rawFermData['enoughSubstrateUsed'] = enoughSubstrateUsed
    rawFermData['isMtcMedium'] = isMtcMedium

    # Apply the filters cumulatively.  Every mask covers all rows of df, so no
    # mask is ever applied to a frame with a different set of rows, and the
    # number removed at each step is the drop in the count of surviving rows.
    keep = ~blankRows
    remaining = [len(df), int(keep.sum())]
    for rowFilter in (nonRejected, hasEtohYield, hasStrainId, enoughSubstrateUsed, isMtcMedium):
        keep = keep & rowFilter
        remaining.append(int(keep.sum()))
    removed = [before - after for before, after in zip(remaining[:-1], remaining[1:])]

    # keep columns from 'carbon balance' to 'StrainID'
    cleanFermData = df.loc[keep, 'carbon balance':'StrainID']

    # summarize cleaning
    print('*********** CLEANING SUMMARY ****************')
    print('initial number of bottles = ', remaining[0])
    print('empty rows removed = ', removed[0])
    print('number rejected = ', removed[1])
    print('rejected for no ethanol (blk bottles) = ', removed[2])
    print('rejected for no strain ID = ', removed[3])
    print('rejected for low substrate utilization = ', removed[4])
    print('rejected for non-MTC medium = ', removed[5])
    print('\nfinal number of bottles = ', remaining[-1])
    print('*********************************************')
    return cleanFermData
def getAllAgFermData(fermProdList, fermData, parChiDf):
    """
    Aggregate fermentation data for every product named in fermProdList.

    Each name is passed, together with fermData and parChiDf, to
    getAgFermData; the results are collected in a dict.

    Parameters:
        fermProdList: iterable of column names, for example
                      ['carbon balance', 'YpelC', 'Yform', 'Ylac', 'Yac',
                       'Yet', 'Ypyr', 'Ymal', 'Yh2', 'PforFlux']
        fermData:     fermentation dataframe handed on to getAgFermData
        parChiDf:     parent/child strain dataframe handed on to getAgFermData

    Returns:
        dict mapping each product name to whatever getAgFermData returned for it
    """
    return {
        product: getAgFermData(product, fermData, parChiDf)
        for product in fermProdList
    }
def getAgFermData(colName, fermData, parChildDf):
    """
    Aggregate one fermentation column per strain and compare each strain to its parent.

    available columns include:
        carbon balance
        YpelC (yield of pellet carbon)
        Yform (yield of formate)
        Ylac (yield of lactate)
        Yac (yield of acetate)
        Yet (yield of ethanol)
        Ypyr (yield of pyruvate)
        Ymal (yield of malate)
        Yh2 (yield of hydrogen)
        PforFlux (PFOR flux = Yac + Yet - Yform)
        substrateFracUsed (fraction of initial cellobiose consumed)

    Parameters:
        colName:    name of the column of fermData to aggregate
        fermData:   dataframe with a 'StrainID' column and the column colName
        parChildDf: dataframe of parent/child strain pairs; 'StrainID' (child)
                    and 'ParentID' (parent) are the merge keys

    Returns:
        None if colName is not a column of fermData.  Otherwise parChildDf
        with these columns added, sorted by ascending 'pval' (NaN last):
            mean_chi, std_chi, count_chi, cv_chi - statistics of the child strain
            mean_par, std_par, count_par, cv_par - statistics of the parent strain
            ratio    - mean_chi / mean_par
            pval     - p-value of Welch's t-test (child vs. parent); NaN unless both
                       strains have more than one bottle and a positive mean
            logRatio - log2 of ratio
    """
    # check to make sure colName is actually the name of a column in fermData
    if colName not in fermData.columns:
        return None

    # select the column before aggregating so other (possibly non-numeric)
    # columns are never aggregated
    aggData = fermData.groupby('StrainID')[colName].agg(['mean', 'std', 'count'])
    aggData['cv'] = aggData['std'] / aggData['mean']

    # attach the child statistics, then the parent statistics; the second merge
    # renames the overlapping statistic columns with the _chi / _par suffixes
    result = pd.merge(parChildDf, aggData, how='left',
                      left_on='StrainID', right_index=True)
    result = pd.merge(result, aggData, how='left',
                      left_on='ParentID', right_index=True,
                      suffixes=('_chi', '_par'))
    result['ratio'] = result['mean_chi'] / result['mean_par']

    # perform T-test based on summary statistics and save result in a new column
    # equal_var = False uses Welch's t-test
    # I think this is a correct assumption, but might want to double check
    # The resulting p-values are slightly larger than when equal_var = True
    #
    # Rows with missing statistics compare as False here and keep a NaN p-value.
    canTest = ((result['count_chi'] > 1)
               & (result['count_par'] > 1)
               & (result['mean_chi'] > 0)
               & (result['mean_par'] > 0)).to_numpy()
    pvals = np.full(len(result), np.nan)
    if canTest.any():
        tested = result.loc[canTest]
        testResult = ttest_ind_from_stats(
            tested['mean_chi'].to_numpy(dtype=float),
            tested['std_chi'].to_numpy(dtype=float),
            tested['count_chi'].to_numpy(dtype=float),
            tested['mean_par'].to_numpy(dtype=float),
            tested['std_par'].to_numpy(dtype=float),
            tested['count_par'].to_numpy(dtype=float),
            equal_var=False)
        pvals[canTest] = testResult[1]

    # assigned positionally (plain array), so the row labels of result do not matter
    result['pval'] = pvals
    result = result.sort_values('pval')

    # add log2 column
    result['logRatio'] = np.log2(result['ratio'])
    return result