Here we will go through the process of writing a parser for data in an
arbitrary format. In this case, the format is provided in the file
new_parser_data.xlsx, which consists of one sheet with data grouped by
time point and another sheet with the identifiers.
Generally, it makes sense to start with one of the existing parsers as a
guide. In this case, the spectromax_OD parser would be the most relevant.
The sample data is available in tests/test_data/sample_parser_docs_data.xlsx.
In [1]:
from impact.parsers import Parser, parse_raw_identifier, parse_time_point_list
from impact import TimePoint
def new_parser(experiment, data, id_type='traverse'):
    """Parse plate-style OD600 data and add the result to ``experiment``.

    ``data`` is indexed by sheet name. ``data['identifiers']`` holds the raw
    identifier of each well, and ``data['data']`` holds the measurements as
    one block of rows per time point. Parsing stops at the first block whose
    first cell is ``'~End'``.

    Args:
        experiment: Object that receives the parsed replicate trials through
            ``add_replicate_trial``.
        data: Mapping providing the ``'identifiers'`` and ``'data'`` sheets,
            each a sequence of rows.
        id_type: Identifier format, forwarded to ``parse_raw_identifier``.

    Returns:
        None. The parsed replicate trials are added to ``experiment``.
    """
    # Define the layout of our data
    first_row_index = 0
    plate_size = 8          # rows in the block of a single time point
    spacing = 1             # rows separating consecutive blocks
    time_row_col = [0, 0]   # (row, column) of the time, relative to the block start
    first_data_col = 2      # first column holding a measurement
    plate_cols = 12         # measurements per row

    # Define the type of data this parser accepts
    analyte_name = 'OD600'
    analyte_type = 'biomass'

    # First prepare the data by extracting the relevant information from each sheet
    unparsed_identifiers = data['identifiers']
    raw_data = data['data']

    # We first parse the identifiers, as these can be recycled (the only thing
    # that is changing is the time). Any cell which is empty, 0, or None is
    # stored as None so that the well is excluded later, for example when a
    # plate is not full.
    empty_cells = ['', 0, '0', None]
    identifiers = [
        [
            None if raw_id in empty_cells else parse_raw_identifier(raw_id, id_type)
            for raw_id in id_row
        ]
        for id_row in unparsed_identifiers
    ]

    timepoint_list = []
    time_row, time_col = time_row_col
    for start_row_index in range(first_row_index, len(raw_data), plate_size + spacing):
        if raw_data[start_row_index][0] == '~End':
            break

        time_value = int(raw_data[start_row_index + time_row][time_col])

        # Define the data for a single plate, single time point
        plate_data = [
            row[first_data_col:first_data_col + plate_cols]
            for row in raw_data[start_row_index:start_row_index + plate_size]
        ]

        # Load the data point by point
        for i, plate_row in enumerate(plate_data):
            # An identifier sheet smaller than the plate means that no
            # identifier is listed for the remaining wells, so they are skipped
            id_row = identifiers[i] if i < len(identifiers) else []
            for j, value in enumerate(plate_row):
                ti = id_row[j] if j < len(id_row) else None
                # Skip wells where no identifier is listed or no data present
                if ti is None or value in [None, '']:
                    continue
                # The same identifier object is reused for every time point
                ti.analyte_type, ti.analyte_name = analyte_type, analyte_name
                timepoint_list.append(TimePoint(ti, time_value, float(value)))

    # Finally we parse all of the time points (into their logical structure
    # based on identifiers) and add them to the experiment
    for rep in parse_time_point_list(timepoint_list):
        experiment.add_replicate_trial(rep)
C:\Users\Naveen\Anaconda3\lib\site-packages\IPython\html.py:14: ShimWarning: The `IPython.html` package has been deprecated since IPython 4.0. You should import from `notebook` instead. `IPython.html.widgets` has moved to `ipywidgets`.
"`IPython.html.widgets` has moved to `ipywidgets`.", ShimWarning)
With the new parser defined, we can register it with the Parser class and
parse our data directly. Parser.parse_raw_data will return an Experiment
instance containing all the data.
In [2]:
# Make the new parser available under a format name of our choosing
Parser.register_parser('my_new_format', new_parser)
# Parse the sample workbook with the parser registered above
expt = Parser.parse_raw_data(
    'my_new_format',
    file_name='../tests/test_data/sample_parser_docs_data.xlsx'
)
Importing data from ../tests/test_data/sample_parser_docs_data.xlsx...0.0s
Parsing time point list...Parsed 246 time points in 0.2s
Parsing analyte list...Parsed 82 analytes in 482.0ms
Parsing single trial list...Parsed 32 replicates in 0.1s
In [3]:
# Print the parsed experiment; the output lists strain, media, environment and analytes
print(expt)
strain media environment analytes
----------------- -------------------- ------------- ----------
3KO-D1 + pKDL071 Base + 1.0 a.u. aTc ['OD600']
3KO-D1 + pKDL071 Base + 2.0 a.u. IPTG ['OD600']
3KO-D28 + pKDL071 Base + 1.0 a.u. aTc ['OD600']
3KO-D28 + pKDL071 Base + 2.0 a.u. IPTG ['OD600']
3KO-D59 + pKDL071 Base + 1.0 a.u. aTc ['OD600']
3KO-D59 + pKDL071 Base + 2.0 a.u. IPTG ['OD600']
IMPT1 + pIMPT001 Base + 1.0 a.u. aTc ['OD600']
IMPT1 + pIMPT001 Base + 2.0 a.u. IPTG ['OD600']
IMPT1 + pIMPT002 Base + 1.0 a.u. aTc ['OD600']
IMPT1 + pIMPT002 Base + 2.0 a.u. IPTG ['OD600']
IMPT1 + pIMPT003 Base + 1.0 a.u. aTc ['OD600']
IMPT1 + pIMPT003 Base + 2.0 a.u. IPTG ['OD600']
IMPT1 + pIMPT004 Base + 1.0 a.u. aTc ['OD600']
IMPT1 + pIMPT004 Base + 2.0 a.u. IPTG ['OD600']
IMPT2 + pIMPT001 Base + 1.0 a.u. aTc ['OD600']
IMPT2 + pIMPT001 Base + 2.0 a.u. IPTG ['OD600']
IMPT2 + pIMPT002 Base + 1.0 a.u. aTc ['OD600']
IMPT2 + pIMPT002 Base + 2.0 a.u. IPTG ['OD600']
IMPT2 + pIMPT003 Base + 1.0 a.u. aTc ['OD600']
IMPT2 + pIMPT003 Base + 2.0 a.u. IPTG ['OD600']
IMPT2 + pIMPT004 Base + 1.0 a.u. aTc ['OD600']
IMPT2 + pIMPT004 Base + 2.0 a.u. IPTG ['OD600']
IMPT3 + pIMPT001 Base + 1.0 a.u. aTc ['OD600']
IMPT3 + pIMPT001 Base + 2.0 a.u. IPTG ['OD600']
IMPT3 + pIMPT002 Base + 1.0 a.u. aTc ['OD600']
IMPT3 + pIMPT002 Base + 2.0 a.u. IPTG ['OD600']
IMPT3 + pIMPT003 Base + 1.0 a.u. aTc ['OD600']
IMPT3 + pIMPT003 Base + 2.0 a.u. IPTG ['OD600']
IMPT3 + pIMPT004 Base + 1.0 a.u. aTc ['OD600']
IMPT3 + pIMPT004 Base + 2.0 a.u. IPTG ['OD600']
dlacI + pKDL071 Base + 1.0 a.u. aTc ['OD600']
dlacI + pKDL071 Base + 2.0 a.u. IPTG ['OD600']