Homogenizing Columns and Concatenating Data Simulated in an msBayes Analysis
msBayes is a suite of tools that support an Approximate Bayesian Computation approach to phylogeographic analysis, estimation and hypothesis testing. The following Python script can be used to ensure that various data files produced by programs in the msBayes pipeline all have the same number and type of columns.
Things to note regarding the operation of the script:
- The script depends crucially on a header row of column names as the first line of each file.
- It only inserts columns into rows of the source files so that their column pattern matches the reference or model file. It does not delete any columns. Thus, the model file should have a column pattern that is a superset of all the column patterns across the source files.
- Not only does it not re-order columns in the source files, but its logic also depends on the column order of the source files matching the order of columns in the model file (sans missing columns, of course).
- All the above conditions are met perfectly by the output files produced by msBayes, so as long as you do not modify these files directly, or are careful to maintain the above conditions if you do modify the files, there should be no problem in the execution of the script.
#! /usr/bin/env python

###############################################################################
##  Copyright 2009 Jeet Sukumaran.
##
##  This program is free software; you can redistribute it and/or modify
##  it under the terms of the GNU General Public License as published by
##  the Free Software Foundation; either version 3 of the License, or
##  (at your option) any later version.
##
##  This program is distributed in the hope that it will be useful,
##  but WITHOUT ANY WARRANTY; without even the implied warranty of
##  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
##  GNU General Public License for more details.
##
##  You should have received a copy of the GNU General Public License along
##  with this program. If not, see <http://www.gnu.org/licenses/>.
##
###############################################################################

"""
Composes aggregate dataset out of source files, inserting null columns if
necessary so as to match named column pattern in specified model file (note
that all files are assumed to have a header row as the first line).
"""

from optparse import OptionGroup
from optparse import OptionParser
import sys

_prog_usage = '%prog -m <MODEL-FILE> [options] <SOURCE-FILES>'
_prog_version = 'REGULARIZE-COLS Version 1.0'
_prog_description = 'Composes aggregate dataset out of specified source files, ' \
    + 'inserting null columns if neccessary so as to match ' \
    + 'named column pattern in specified model file (note that ' \
    + 'all files are assumed to have a header row as the ' \
    + 'first line, and column order and names are identical across files).'
_prog_author = 'Jeet Sukumaran'
_prog_copyright = 'Copyright (C) 2009 Jeet Sukumaran.'


def _build_column_map(model_fields, source_fields):
    """
    Map model column indexes to source column indexes.

    Both lists are walked in order: a model column is mapped only if its name
    equals the next not-yet-matched source column. Model columns without a
    match are absent from the returned dictionary (they will be filled with
    the dummy value). Because matching is strictly sequential, the matched
    source columns are always ``source_fields[:len(result)]``.
    """
    col_map = {}
    src_idx = 0
    for model_idx, name in enumerate(model_fields):
        # Guard against running off the end of the source columns, which
        # happens whenever the missing columns are the trailing ones.
        if src_idx < len(source_fields) and name == source_fields[src_idx]:
            col_map[model_idx] = src_idx
            src_idx += 1
    return col_map


def _write_regularized_rows(source_file, source_fields, model_fields, col_map,
                            separator, dummy_value, output):
    """
    Copy the remaining rows of `source_file` to `output` in model layout.

    `source_file` must be positioned just after its header row. Blank rows
    are skipped; rows whose field count differs from the header are reported
    on standard error and skipped. Columns absent from `col_map` are filled
    with `dummy_value`.
    """
    # For every model column: the source index to copy from, or None.
    source_index_for = [col_map.get(idx) for idx in range(len(model_fields))]
    for row_idx, row in enumerate(source_file, 1):
        stripped = row.strip()
        if not stripped:
            continue
        data_fields = stripped.split(separator)
        if len(data_fields) != len(source_fields):
            sys.stderr.write(" * ERROR: Expecting %d fields in data row %d, but found %d.\n"
                             % (len(source_fields), row_idx, len(data_fields)))
            continue
        output_fields = [dummy_value if src_idx is None else data_fields[src_idx]
                         for src_idx in source_index_for]
        output.write(separator.join(output_fields))
        output.write("\n")


def _process_source(source_file, model_fields, opts, output):
    """
    Read the header of `source_file`, report how it differs from the model
    column pattern, and append its regularized rows to `output`.

    Exits the program with status 1 if the source file has more columns than
    the model file.
    """
    source_fields = source_file.readline().strip().split(opts.separator)
    if len(source_fields) > len(model_fields):
        sys.stderr.write(' * ERROR: Source file has more columns (%d) than model file (%d).\n'
                         % (len(source_fields), len(model_fields)))
        sys.stderr.write(", ".join(source_fields))
        sys.stderr.write("\n")
        sys.exit(1)
    if len(source_fields) == len(model_fields):
        sys.stderr.write(' * Source file has same number of columns as model file.\n')
    else:
        sys.stderr.write(' * Source file has %d columns instead of %d.\n'
                         % (len(source_fields), len(model_fields)))

    col_map = _build_column_map(model_fields, source_fields)

    # Source columns that never lined up with a model column cannot be
    # placed anywhere in the output; say so instead of dropping them silently.
    unmatched = source_fields[len(col_map):]
    if unmatched:
        sys.stderr.write(' * WARNING: Source columns not matched in model file (dropped): %s\n'
                         % ", ".join('"%s"' % name for name in unmatched))

    if len(col_map) < len(model_fields):
        sys.stderr.write(' * Columns to be inserted: ')
        for mf_idx, field in enumerate(model_fields):
            if mf_idx not in col_map:
                sys.stderr.write('[%d]:"%s" ' % (mf_idx+1, field))
        sys.stderr.write('\n')
    else:
        sys.stderr.write(" * No columns to be inserted.\n")

    _write_regularized_rows(source_file, source_fields, model_fields, col_map,
                            opts.separator, str(opts.dummy_value), output)


def main():
    """
    Main CLI handler.

    Parses the command line, reads the column names from the header row of
    the model file, writes that header to the output, and then appends the
    rows of every source file with dummy values inserted for the model
    columns the source file lacks. Progress and diagnostics are written to
    standard error. Source files that cannot be opened are skipped.
    """

    parser = OptionParser(usage=_prog_usage,
        add_help_option=True,
        version=_prog_version,
        description=_prog_description)

    parser.add_option('-m', '--model',
        action='store',
        dest='model_filepath',
        type='string',
        default=None,
        metavar='MODEL-FILEPATH',
        help='file providing reference pattern of columns to match')

    parser.add_option('-s', '--sep',
        action='store',
        dest='separator',
        type='string',
        default='\t',
        metavar='COLUMN-SEPARATOR',
        help='character to use as column separator (default=<TAB>)')

    parser.add_option('-o', '--output',
        action='store',
        dest='output_filepath',
        type='string',
        default=None,
        metavar='OUTPUT-FILEPATH',
        help='output filepath (default: standard output)')

    parser.add_option('-d', '--dummy-value',
        action='store',
        dest='dummy_value',
        type='int',
        default=-9999,
        metavar='DUMMY-VALUE',
        help='value to use into dummy columns inserted (default: %default)')

    (opts, args) = parser.parse_args()

    if opts.model_filepath is None:
        sys.exit("Please specify path to a model/template file using the \"-m\" option.")

    if len(args) == 0:
        sys.exit("Please specify path to one or more files.")

    # Plain "r" mode: the "U" flag was removed in Python 3.11, and text mode
    # already translates newlines on Python 3.
    with open(opts.model_filepath, "r") as model_file:
        model_header = model_file.readline()
    model_fields = [name for name in model_header.strip().split(opts.separator)
                    if name != ""]
    sys.stderr.write("(found %d fields in model file)" % len(model_fields))
    sys.stderr.write("\n")

    if opts.output_filepath is None:
        output = sys.stdout
    else:
        output = open(opts.output_filepath, "w")

    try:
        output.write(opts.separator.join(model_fields))
        output.write("\n")

        for fpath_idx, fpath in enumerate(args):
            sys.stderr.write('-- Processing %d of %d: %s\n' % (fpath_idx+1, len(args), fpath))
            try:
                source_file = open(fpath, "r")
            except (IOError, OSError):
                sys.stderr.write(" * Cannot open file: skipping.\n")
                continue
            with source_file:
                _process_source(source_file, model_fields, opts, output)
    finally:
        # Flush and release the output file, but never close standard output.
        if output is not sys.stdout:
            output.close()


if __name__ == '__main__':
    main()
feed
Post new comment