#!/usr/bin/python
"""Sort agglomerates from a statgen output file using a dynamic criterion.

The script reads a statgen file, keeps only those connections that are
present in every neighbouring FILE section of a sliding window, and writes
the resulting distribution of agglomerate sizes to a text table.
"""

import argparse
import os
import sys
from collections import Counter


def calculate(splited, system):
    """Compute the size distribution of the agglomerates.

    Args:
        splited: list of files; each file is a list of agglomerates and each
            agglomerate is a sequence of molecules (only its length is used).
        system: dict with integer entries ``'steps'`` and ``'mols'``.

    Returns:
        dict mapping every size ``n`` from 1 up to the largest size found to
        its value.  For ``n >= 2`` the value is
        ``n * count(n) / (steps * mols)``; the value for ``n == 1`` is one
        minus the sum of all the others.  When there is no agglomerate at
        all the result is ``{1: 1.0}``.

    Raises:
        ZeroDivisionError: if ``steps * mols`` is zero while an agglomerate
            of size two or more is present.
    """
    # Empty agglomerates carry no molecules, so they are not counted.
    sizes = Counter(len(aggl) for file in splited for aggl in file if aggl)
    largest = max(sizes) if sizes else 0

    # Key 1 is created first so the table is written in ascending order;
    # its real value is only known once the other sizes are summed up.
    results = {1: 1.0}
    summ = 0
    for size in range(2, largest + 1):
        results[size] = size * sizes[size] / (system['steps'] * system['mols'])
        summ += results[size]
    results[1] = 1.0 - summ
    return results


def parse_file(splited):
    """Convert textual molecule records into ``[id, [connections]]`` lists.

    Every record has the form ``"<id>=<conn>,<conn>,..."``; empty items in
    the connection list are ignored.  The conversion is done in place.

    Args:
        splited: list of files, each a list of agglomerates, each a list of
            record strings.

    Returns:
        The same ``splited`` object with every record replaced by
        ``[int id, [int connection, ...]]``.
    """
    for file in splited:
        for agl in file:
            for i, molecule in enumerate(agl):
                fields = molecule.split('=')
                connections = [int(conn) for conn in fields[1].split(',') if conn]
                agl[i] = [int(fields[0]), connections]
    return splited


def print_result(output, matrix):
    """Write the size distribution as a text table.

    Args:
        output: path of the file to create or overwrite.
        matrix: dict mapping agglomerate size to its value, as returned by
            ``calculate``.
    """
    # NOTE(review): literals are reproduced as found; confirm the column
    # spacing of the header against the expected output format.
    with open(output, 'w') as out:
        out.write("| n | p |\n-----------------\n")
        for size, value in matrix.items():
            out.write(" %7i %7.5f \n" % (size, value))
        out.write("-----------------\n")


def read_file(input_file):
    """Read a statgen file and split it into files and agglomerates.

    The file must start with ``statgen``.  Lines are consumed from the end
    up to the line starting with ``SUMMARY STATISTIC``; among them, lines
    starting with ``" 1"`` provide the numbers from which the step and
    molecule counts are derived.  Everything before that line is split on
    ``FILE`` and then on ``AGL``.

    Args:
        input_file: path to the statgen file.

    Returns:
        Tuple ``(splited, system)`` where ``splited`` is a list of files,
        each a list of agglomerates, each a list of non-empty record lines,
        and ``system`` is ``{'steps': int, 'mols': int}``.

    Raises:
        SystemExit: with code 2 if the file does not start with ``statgen``
            or contains no ``SUMMARY STATISTIC`` line.
    """
    with open(input_file) as handle:
        content = handle.read()
    if not content.startswith("statgen"):
        print("It is not a statgen file")
        sys.exit(2)

    lines = content.split("\n")
    system = {'steps': 0, 'mols': 0}
    # Walk backwards through the summary; if several lines match, the one
    # closest to the "SUMMARY STATISTIC" header is processed last and wins.
    # NOTE(review): the " 1" prefix is reproduced as found; confirm its
    # width against real statgen output.
    while lines and not lines[-1].startswith("SUMMARY STATISTIC"):
        statistic = lines.pop()
        if not statistic.startswith(" 1"):
            continue
        values = [float(item) for item in statistic.split()]
        system['steps'] = int(round(values[1] / values[2], 0))
        system['mols'] = int(round(values[2] / values[3], 0))
    if not lines:
        print("Could not find SUMMARY STATISTIC section in %s" % input_file)
        sys.exit(2)
    lines.pop()  # drop the "SUMMARY STATISTIC" header itself

    splited = '\n'.join(lines).split("FILE")[1:]
    for i, item in enumerate(splited):
        # The first line of every AGL chunk is the rest of its header line.
        text = [[single for single in agl.split("\n")[1:] if single]
                for agl in item.split("AGL")[1:]]
        # The last non-empty line of each FILE section is discarded
        # (presumably a separator line -- confirm against statgen output).
        if text and text[-1]:
            text[-1].pop()
        splited[i] = text

    return splited, system


def select_aggl(splited, step):
    """Keep only connections that persist over a window of files.

    For the file with index ``i`` the window consists of the files with
    indices from ``max(i - step, 0)`` up to, but not including,
    ``min(i + step, len(splited) - 1)``.  A connection of a molecule is kept
    only if it occurs, for the same molecule id, exactly as many times as
    there are files in the window.  Molecules left without connections and
    agglomerates left without molecules are removed.

    The data is modified in place, file by file, so the windows of later
    files see earlier files in their already filtered state.

    Args:
        splited: parsed data as returned by ``parse_file``.
        step: half-width of the window, in files.

    Returns:
        The same ``splited`` object after filtering.
    """
    last_index = len(splited) - 1
    for i, file in enumerate(splited):
        first = max(i - step, 0)
        last = min(i + step, last_index)
        window = splited[first:last]
        required = last - first

        for aggl in file:
            for mol in aggl:
                seen = Counter(
                    conn
                    for other_file in window
                    for aggl_other in other_file
                    for other_single in aggl_other
                    if other_single[0] == mol[0]
                    for conn in other_single[1]
                )
                # Rebuild instead of popping while iterating: popping shifts
                # the following items and makes the loop skip them.
                mol[1][:] = [conn for conn in mol[1] if seen[conn] == required]
            aggl[:] = [mol for mol in aggl if mol[1]]
        file[:] = [aggl for aggl in file if aggl]

    return splited


def main():
    """Parse the command line and run the whole processing chain."""
    parser = argparse.ArgumentParser(description="Sort agglomerates using dynamic criteria")
    parser.add_argument('input', help="file from statgen")
    parser.add_argument('-s', '--step', type=int, help="step to check agglomerates", default=1)
    parser.add_argument('-o', '--output', help="output file", default="dhb-output.dat")
    args = parser.parse_args()

    if not os.path.isfile(args.input):
        print("Could not find file %s" % args.input)
        sys.exit(1)

    splited, system = read_file(args.input)
    splited = parse_file(splited)
    matrix = calculate(select_aggl(splited, args.step), system)
    print_result(args.output, matrix)


if __name__ == "__main__":
    main()