Easy Rejection Sampling in msBayes
msBayes is a suite of tools that support a Approximate Bayesian Computation approach for phylogeographic analysis, estimation and hypothesis testing.
The following Python script facilitates the execution of the rejection sampling step of the analysis, by wrapping the invocation to the "msReject" program, automatically identifying the summary statistic columns in the data files and composing the correct command arguments.
The script depends crucially on a header row of column names as the first line of the files, and assumes that all prior/parameter column names begin with the string "PRI.".
#! /usr/bin/env python ############################################################################### ## Copyright 2009 Jeet Sukumaran. ## ## This program is free software; you can redistribute it and/or modify ## it under the terms of the GNU General Public License as published by ## the Free Software Foundation; either version 3 of the License, or ## (at your option) any later version. ## ## This program is distributed in the hope that it will be useful, ## but WITHOUT ANY WARRANTY; without even the implied warranty of ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ## GNU General Public License for more details. ## ## You should have received a copy of the GNU General Public License along ## with this program. If not, see <http://www.gnu.org/licenses/>. ## ############################################################################### """ Performs accept/reject sampling on a file of simulations from priors to produce set of samples the posterior distribution. (1) Identifies summary statistic columns in a given data file (based on column names that do not begin with "PRI.". (2) Calls msReject with the appriopriate parameters. """ from optparse import OptionGroup from optparse import OptionParser import subprocess import sys import os _prog_usage = '%prog [options] <OBS-DATA-FILE> <SIM-DATA-FILE>' _prog_version = 'SAMPLE-POSTERIORS Version 1.0' _prog_description = """Performs accept/reject sampling on a file of simulations from priors to produce set of samples the posterior distribution.""" _prog_author = 'Jeet Sukumaran' _prog_copyright = 'Copyright (C) 2009 Jeet Sukumaran.' def parse_fields(row, separator="\t", include_empty=False): fields = row.strip().replace("\n", "").split(separator) if not include_empty: fields = [f for f in fields if f != ""] return fields def main(): """ Main CLI handler. """ parser = OptionParser(usage=_prog_usage, add_help_option=True, version=_prog_version, description=_prog_description) parser.add_option('-a', '--application', action='store', dest='app_path', type='string', # also 'float', 'string' etc. default="msReject", metavar='MSREJECT-PATH', help='path to msReject (default="%default")') parser.add_option('-t', '--tolerance', action='store', dest='tolerance', type='float', # also 'float', 'string' etc. default=0.02, metavar='TOLERANCE', help='sampling tolerance (default=%default)') parser.add_option('-s', '--sep', action='store', dest='separator', type='string', # also 'float', 'string' etc. default='\t', metavar='COLUMN-SEPARATOR', help='character to use as column separator (default=<TAB>)') parser.add_option('-n', '--no-execute', action='store_true', dest="dry_run", default=False, help='print command to standard output, but do not actually run') parser.add_option('-q', '--quiet', action='store_true', dest="quiet", default=False, help='run without any supplemental messages') (opts, args) = parser.parse_args() if len(args) < 2: sys.exit("Paths to observed and simulated data files not specified.") obs_fpath = os.path.expandvars(os.path.expanduser(args[0])) sim_fpath = os.path.expandvars(os.path.expanduser(args[1])) obs_file = open(obs_fpath, "rU") sim_file = open(sim_fpath, "rU") obs_fields = parse_fields(obs_file.readline(), opts.separator) sim_fields = parse_fields(sim_file.readline(), opts.separator) if len(obs_fields) != len(sim_fields): sys.exit("ERROR: Number of columns in observed data file (%d) does not equal number of columns in simulated data file (%d)." % (len(obs_fields), len(sim_fields))) param_cols = [] sum_stat_cols = [] for i, f in enumerate(obs_fields): if obs_fields[i] != sim_fields[i]: sys.exit('ERROR: Column %d does not correspond between observed and simulated files ("%s" vs. "%s").' % (i+1, obs_fields[i], sim_fields[i])) if not f.startswith("PRI."): sum_stat_cols.append(i) else: param_cols.append(i) if not opts.quiet: sys.stderr.write("Following columns identified as PARAMETERS:\n") for c in param_cols: sys.stderr.write(' [%d]:"%s"\n' % (c+1, obs_fields[c])) sys.stderr.write("\nFollowing columns identified as SUMMARY STATISTICS:\n") for c in sum_stat_cols: sys.stderr.write(' [%d]:"%s"\n' % (c+1, obs_fields[c])) sys.stderr.write('\n') cmd = (opts.app_path + " " + obs_fpath + " " + sim_fpath + " " + str(opts.tolerance) + " " + " ".join([str(i+1) for i in sum_stat_cols])) if opts.dry_run: sys.stdout.write(cmd + "\n") sys.exit(0) if not opts.quiet: sys.stderr.write("\nCommand to be executed:\n") sys.stderr.write(' %s\n' % cmd) msreject = subprocess.Popen(cmd, shell=True) if not opts.quiet: sys.stderr.write('\nRunning with PID: %d.\n' % msreject.pid) msreject.wait() if not opts.quiet: sys.stderr.write("Completed with exit status: %s.\n" % msreject.returncode) if __name__ == '__main__': main()
feed
Comments
0 comments postedPost new comment