APEMoST Code

Brought to you by: johannesbuchner, mgrub83
[de244b]: / src / analyse.c Maximize Restore History

287 lines (253 with data), 8.2 kB

/*
    APEMoST - Automated Parameter Estimation and Model Selection Toolkit
    Copyright (C) 2009  Johannes Buchner

    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

#include <omp.h>

#include "mcmc.h"
#include "parallel_tempering_config.h"
#include "debug.h"
#include "define_defaults.h"
#include "gsl_helper.h"
#include "parallel_tempering_run.h"
#include "histogram.h"
#include "utils.h"

/*
 * calculate data probability
 */
void analyse_data_probability() {
	unsigned int i;
	unsigned int j;
	unsigned long n = 0;
	double v;
	double w;
	double sums[100];
	double previous_beta;
	double data_logprob;
	char buf[100];
	FILE * f;
	unsigned int n_beta = N_BETA;
	mcmc ** chains = setup_chains();

	read_calibration_file(chains, n_beta);

	assert(n_beta < 100);
	for (i = 0; i < n_beta; i++) {
		sprintf(buf, "prob-chain%d.dump", i);
		dump_s("summing up probability file", buf);
		printf("reading probabilities of chain %d\r", i);
		fflush(stdout);
		f = fopen(buf, "r");
		if (f == NULL) {
			fprintf(stderr,
					"calculating data probability failed: file %s not found\n",
					buf);
			return;
		}
		n = 0;
		sums[i] = 0;
		while (!feof(f)) {
			if (fscanf(f, "%le\t%le", &w, &v) == 2) {
				/*
				 * note: rounding errors could occur here
				 * since the values are of the same magnitude, they hopefully won't
				 */
				sums[i] += v;
				n++;
			}
		}
		if (n == 0) {
			fprintf(stderr, "calculating data probability failed: "
				"no data points found in %s\n", buf);
			return;
		}
		sums[i] = sums[i] / get_beta(chains[i]) / n;
	}

	data_logprob = 0;
	previous_beta = 0;
	/* calculate the integral by an estimate */
	for (j = n_beta - 1;; j--) {
		assert((get_beta(chains[j]) > previous_beta));

		data_logprob += sums[j] * (get_beta(chains[j]) - previous_beta);

		if (j == 0)
			break;
		previous_beta = get_beta(chains[j]);
	}

	printf("Model probability ln(p(D|M, I)): [about 10^%.0f] %.5f"
		"\n"
		"\nTable to compare support against other models (Jeffrey):\n"
		" other model ln(p(D|M,I)) | supporting evidence for this model\n"
		" --------------------------------- \n"
		"        >  %04.1f \tnegative (supports other model)\n"
		"  %04.1f .. %04.1f \tBarely worth mentioning\n"
		"  %04.1f .. %04.1f \tSubstantial\n"
		"  %04.1f .. %04.1f \tStrong\n"
		"  %04.1f .. %04.1f \tVery strong\n"
		"        <  %04.1f \tDecisive\n", data_logprob / gsl_sf_log(10),
			data_logprob, data_logprob, data_logprob, 
			data_logprob - gsl_sf_log(3), data_logprob - gsl_sf_log(3), 
			data_logprob - gsl_sf_log(10), data_logprob - gsl_sf_log(10),
			data_logprob - gsl_sf_log(30), data_logprob - gsl_sf_log(30),
			data_logprob - gsl_sf_log(100), data_logprob - gsl_sf_log(100)
       );
       printf("\nbe careful.\n");
}

double calc_mcmc_error(const double mean, const char * filename,
		unsigned long batchsize) {
	double v;
	unsigned long n = 0;
	int nbatches = 0;
	FILE * f = openfile(filename);
	double batchsum = 0;
	double batchmean;
	double batcherror;
	double errorsum = 0;

	while (!feof(f)) {
		if (fscanf(f, "%lf", &v) == 1) {
			n++;
			batchsum += v;
			if (n % batchsize == batchsize - 1) {
				batchmean = batchsum / batchsize;
				/*printf("batchmean: %f (batchsize = %lu, batchnr %d)\n",
				 batchmean, batchsize, nbatches);*/
				batcherror = /*batchsize * */pow(batchmean - mean, 2);
				errorsum += batcherror;
				batchsum = 0;
				nbatches++;
			}
		}
	}
	return sqrt(errorsum / nbatches);
}

#ifndef NBINS
#define NBINS 200
#endif

#ifdef __NEVER_SET_FOR_DOCUMENTATION_ONLY
/**
 * If not set, the marginal distribution will be calculated for the whole
 * parameter range. Pros: faster, comparable. Cons: Not as detailed.
 *
 * If set, the maximum and minimum values found are used for
 * the histogram. Pros: more detailed in the area of interest
 */
#define HISTOGRAMS_MINMAX
#endif

void calc_marginal_distribution(mcmc ** chains, unsigned int n_beta,
		unsigned int param, int find_minmax) {
	const unsigned int nbins = NBINS;
	char ** filenames;
	unsigned int filecount = n_beta;

	unsigned int i;
	gsl_vector * min;
	gsl_vector * max;
	gsl_histogram * h;
	FILE * outfile;
	double iter;
	double mean;
	double sigma;
	double mcmcerror;
	char outfilename[100];
	const char * paramname = get_params_descr(chains[0])[param];

#ifdef HISTOGRAMS_ALLCHAINS
	filecount = n_beta;
#else
	filecount = 1;
#endif

	filenames = (char **) calloc(filecount + 1, sizeof(char*));
	for (i = 0; i < filecount; i++) {
		filenames[i] = (char *) malloc(100 * sizeof(char));
		sprintf(filenames[i], "%s-chain-%d.prob.dump", paramname, i);
	}
	sprintf(outfilename, "%s.histogram", paramname);

	min = gsl_vector_alloc(1);
	max = gsl_vector_alloc(1);

	gsl_vector_set(min, 0, get_params_min_for(chains[0], param));
	gsl_vector_set(max, 0, get_params_max_for(chains[0], param));

	if (find_minmax != 0) {
		for (i = 0; i < filecount; i++) {
			printf("minmax search : chain %3d parameter %s   \r", i, paramname);
			fflush(stdout);
			if (1 != get_column_count(filenames[i])) {
				fprintf(
						stderr,
						"number of columns different in file %s: %i vs %i in %s\n",
						filenames[i], 1, get_column_count(filenames[i]),
						filenames[0]);
				exit(1);
			}
			if (i == 0)
				find_min_max(filenames[0], min, max);
			else
				update_min_max(filenames[i], min, max);
		}
		dump_v("minima", min);
		dump_v("maxima", max);
	}
	h = create_hist(nbins, gsl_vector_get(min, 0), gsl_vector_get(max, 0));
	debug("filling histogram... ");
	for (i = 0; i < filecount; i++) {
		printf("reading values: chain %3d parameter %s   \r", i, paramname);
		fflush(stdout);
		dump_s("with file", filenames[i]);
		append_to_hists(&h, 1, filenames[i]);
	}
	iter = gsl_histogram_sum(h);
	gsl_histogram_scale(h, (gsl_vector_get(max, 0) - gsl_vector_get(min, 0))
			/ nbins / iter);

	outfile = fopen(outfilename, "w");
	debug("writing histogram... ");
	assert(outfile != NULL);
	gsl_histogram_fprintf(outfile, h, DUMP_FORMAT, DUMP_FORMAT);
	dump_s("histogram file done", outfilename);
	fclose(outfile);

	mean = gsl_histogram_mean(h);
	sigma = gsl_histogram_sigma(h);
	for (i = 0; i < filecount; i++) {
		mcmcerror = calc_mcmc_error(mean, filenames[i], sqrt(iter));
		printf("mcmc error estimate of %s: %f %s\n", paramname, mcmcerror,
				(mcmcerror > sigma * 0.01 ? "** high!" : " (ok)"));
		free(filenames[i]);
	}
	printf("Note: Include a error estimate in your publication!\n");
	free(filenames);

	gsl_histogram_free(h);
}

#ifndef GNUPLOT_STYLE
#define GNUPLOT_STYLE "with histeps"
#endif

void analyse_marginal_distributions() {
	unsigned int i;
	int find_minmax = 0;
	unsigned int n_beta = N_BETA;
	mcmc ** chains = setup_chains();
	FILE * plotplate;

	read_calibration_file(chains, n_beta);

#ifdef HISTOGRAMS_MINMAX
	find_minmax = 1;
#endif

	for (i = 0; i < get_n_par(chains[0]); i++) {
		calc_marginal_distribution(chains, n_beta, i, find_minmax);
	}

	plotplate = fopen("marginal_distributions.gnuplot", "w");
	assert(plotplate != NULL);
	fprintf(plotplate, "# set terminal png size %d,%d; set output "
		"\"marginal_distributions.png\"\n", 600, 300 * get_n_par(chains[0]));
	fprintf(plotplate, "set multiplot\n");
	fprintf(plotplate, "set size 1,%f\n", 1. / get_n_par(chains[0]));
	for (i = 0; i < get_n_par(chains[0]); i++) {
		fprintf(plotplate, "set origin 0,%f\n", (get_n_par(chains[0]) - i - 1)
				* 1. / get_n_par(chains[0]));
		fprintf(plotplate, "plot \"%s.histogram\" u 1:3 title \"%s\" "
		GNUPLOT_STYLE
		"\n", get_params_descr(chains[0])[i], get_params_descr(chains[0])[i]);
	}
	fprintf(plotplate, "unset multiplot\n");
	fclose(plotplate);
}
APEMoST Code

Branches

[de244b]: / src / analyse.c Maximize Restore History

287 lines (253 with data), 8.2 kB