diff options
-rw-r--r-- | .SRCINFO | 37 | ||||
-rw-r--r-- | Makefile.patch | 88 | ||||
-rw-r--r-- | PKGBUILD | 72 | ||||
-rw-r--r-- | alphabet.cpp.patch | 11 | ||||
-rw-r--r-- | alphabet.h.patch | 11 | ||||
-rw-r--r-- | centrifuge-build.patch | 8 | ||||
-rw-r--r-- | centrifuge-inspect.patch | 8 | ||||
-rw-r--r-- | centrifuge_evaluate.py.patch | 135 | ||||
-rw-r--r-- | centrifuge_evaluate_mason.py.patch | 127 | ||||
-rw-r--r-- | centrifuge_simulate_reads.py.patch | 143 |
10 files changed, 618 insertions, 22 deletions
@@ -1,14 +1,37 @@ pkgbase = centrifuge - pkgdesc = Bioinformatics taxonomic classifier for microbial classification + pkgdesc = Rapid and memory-efficient tool for classification of metagenomic sequences pkgver = 1.0.4 - pkgrel = 1 + pkgrel = 2 url = https://ccb.jhu.edu/software/centrifuge/ arch = x86_64 license = GPL3 - depends = libpthread-stubs - depends = zlib - source = centrifuge-1.0.4.tar.gz::https://github.com/infphilo/centrifuge/archive/v1.0.4-beta.tar.gz - sha256sums = 64eb3aa3461d27462357811832f39a8f85702eb536482f1e67344761ad8ca757 + makedepends = git + makedepends = inetutils + makedepends = pandoc-cli + depends = glibc + depends = gcc-libs + depends = python + depends = perl + depends = bash + depends = jellyfish + depends = hisat2 + source = centrifuge-1.0.4.tar.gz::https://github.com/DaehwanKimLab/centrifuge/archive/refs/tags/v1.0.4.tar.gz + source = alphabet.cpp.patch + source = alphabet.h.patch + source = centrifuge-build.patch + source = centrifuge_evaluate_mason.py.patch + source = centrifuge_evaluate.py.patch + source = centrifuge-inspect.patch + source = centrifuge_simulate_reads.py.patch + source = Makefile.patch + sha256sums = 929daed0f84739f7636cc1ea2757527e83373f107107ffeb5937a403ba5201bc + sha256sums = ae9334ddd0cb9b09811969c151350c4e5ee73452fe5d00f01fd7cf23d6573d78 + sha256sums = f681bd2fd89429245ff092b689d53c7e119a5f1270a414b8629896fd5ced90f5 + sha256sums = 12be02a7c3b63679703874b45b6491c81402bac7159859936fe4c78a47632ded + sha256sums = ee6e4c8d8b3d12c141bfeff1aa8c7afd027259d8aa0d1380f6854d4619d868ad + sha256sums = ad05064999c0da91866c82063e42ef60107142acc878fde0342bd629d26d3e68 + sha256sums = bf19a2b9e4ef745227eb593d93543020d31726e38d000c6a0091aeb0c3931ffc + sha256sums = f4268c3aa84fc1e14281470251217c1a4a36317c6ad93fc772ea4a46218a3a67 + sha256sums = e523ba32ad8163db09fd516a51756f726434dbbe2a2dd9697a5e6a6d606097c1 pkgname = centrifuge - diff --git a/Makefile.patch b/Makefile.patch new file mode 100644 index 000000000000..89ab3540e169 --- /dev/null +++ b/Makefile.patch @@ -0,0 +1,88 @@ +--- a/Makefile 2021-08-17 00:18:48.000000000 +0530 ++++ b/Makefile 2023-04-14 23:02:47.795861027 +0530 +@@ -141,7 +141,7 @@ + #GIT_VERSION = $(shell command -v git 2>&1 > /dev/null && git describe --long --tags --dirty --always --abbrev=10 || cat VERSION) + + # Convert BITS=?? to a -m flag +-BITS=32 ++BITS= + ifeq (x86_64,$(shell uname -m)) + BITS=64 + endif +@@ -160,15 +160,15 @@ + ifeq (64,$(BITS)) + BITS_FLAG = -m64 + endif +-SSE_FLAG=-msse2 ++SSE_FLAG= + + DEBUG_FLAGS = -O0 -g3 $(BIToS_FLAG) $(SSE_FLAG) -std=c++11 + DEBUG_DEFS = -DCOMPILER_OPTIONS="\"$(DEBUG_FLAGS) $(EXTRA_FLAGS)\"" +-RELEASE_FLAGS = -O3 $(BITS_FLAG) $(SSE_FLAG) -funroll-loops -g3 -std=c++11 ++RELEASE_FLAGS = -O2 $(BITS_FLAG) $(SSE_FLAG) -funroll-loops -g3 -std=c++11 + RELEASE_DEFS = -DCOMPILER_OPTIONS="\"$(RELEASE_FLAGS) $(EXTRA_FLAGS)\"" + NOASSERT_FLAGS = -DNDEBUG + FILE_FLAGS = -D_LARGEFILE_SOURCE -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE +-CFLAGS = ++#CFLAGS = + #CFLAGS = -fdiagnostics-color=always + + ifeq (1,$(USE_SRA)) +@@ -255,7 +255,8 @@ + $(CFLAGS) \ + $(PREF_DEF) \ + $(MM_DEF) \ +- $(SHMEM_DEF) ++ $(SHMEM_DEF) \ ++ $(LDFLAGS) + + # + # centrifuge targets +@@ -363,11 +364,11 @@ + + centrifuge-build.bat: + echo "@echo off" > centrifuge-build.bat +- echo "python %~dp0/centrifuge-build %*" >> centrifuge-build.bat ++ echo "python3 %~dp0/centrifuge-build %*" >> centrifuge-build.bat + + centrifuge-inspect.bat: + echo "@echo off" > centrifuge-inspect.bat +- echo "python %~dp0/centrifuge-inspect %*" >> centrifuge-inspect.bat ++ echo "python3 %~dp0/centrifuge-build %*" >> centrifuge-inspect.bat + + + .PHONY: centrifuge-src +@@ -409,24 +410,24 @@ + MANUAL: MANUAL.markdown + perl doc/strip_markdown.pl < $^ > $@ + +-prefix=/usr/local ++prefix=/usr/ + + .PHONY: install + install: all +- mkdir -p $(prefix)/bin +- mkdir -p $(prefix)/share/centrifuge/indices +- install -m 0644 indices/Makefile $(prefix)/share/centrifuge/indices +- install -d -m 0755 $(prefix)/share/centrifuge/doc +- install -m 0644 doc/* $(prefix)/share/centrifuge/doc ++ mkdir -p $(DESTDIR)$(prefix)/bin ++ mkdir -p $(DESTDIR)$(prefix)/share/centrifuge/indices ++ install -m 0644 indices/Makefile $(DESTDIR)$(prefix)/share/centrifuge/indices ++ install -d -m 0755 $(DESTDIR)$(prefix)/share/doc/centrifuge ++ install -m 0644 doc/* $(DESTDIR)$(prefix)/share/doc/centrifuge + for file in $(CENTRIFUGE_BIN_LIST) $(CENTRIFUGE_SCRIPT_LIST); do \ +- install -m 0755 $$file $(prefix)/bin ; \ ++ install -m 0755 $$file $(DESTDIR)$(prefix)/bin ; \ + done + + .PHONY: uninstall + uninstall: all + for file in $(CENTRIFUGE_BIN_LIST) $(CENTRIFUGE_SCRIPT_LIST); do \ +- rm -v $(prefix)/bin/$$file ; \ +- rm -v $(prefix)/share/centrifuge; \ ++ rm -v $(DESTDIR)$(prefix)/bin/$$file ; \ ++ rm -v $(DESTDIR)$(prefix)/share/centrifuge; \ + done + + @@ -1,27 +1,69 @@ -# Maintainer: Clint Valentine <valentine.clint@gmail.com> +# Maintainer: Bipin Kumar <kbipinkumar@pm.me> +# Previous Maintainer: Clint Valentine <valentine.clint@gmail.com> pkgname=centrifuge pkgver=1.0.4 -pkgrel=1 -pkgdesc="Bioinformatics taxonomic classifier for microbial classification" +pkgrel=2 +pkgdesc="Rapid and memory-efficient tool for classification of metagenomic sequences" arch=('x86_64') url=https://ccb.jhu.edu/software/centrifuge/ license=('GPL3') -depends=('libpthread-stubs' 'zlib') -source=("${pkgname}"-"${pkgver}".tar.gz::https://github.com/infphilo/"${pkgname}"/archive/v"${pkgver}"-beta.tar.gz) -sha256sums=('64eb3aa3461d27462357811832f39a8f85702eb536482f1e67344761ad8ca757') +depends=( + 'glibc' + 'gcc-libs' + 'python' + 'perl' + 'bash' + 'jellyfish' + 'hisat2' + ) +makedepends=('git' 'inetutils' 'pandoc-cli') +source=("${pkgname}-${pkgver}.tar.gz::https://github.com/DaehwanKimLab/centrifuge/archive/refs/tags/v${pkgver}.tar.gz" + 'alphabet.cpp.patch' + 'alphabet.h.patch' + 'centrifuge-build.patch' + 'centrifuge_evaluate_mason.py.patch' + 'centrifuge_evaluate.py.patch' + 'centrifuge-inspect.patch' + 'centrifuge_simulate_reads.py.patch' + 'Makefile.patch' + ) +sha256sums=('929daed0f84739f7636cc1ea2757527e83373f107107ffeb5937a403ba5201bc' + 'ae9334ddd0cb9b09811969c151350c4e5ee73452fe5d00f01fd7cf23d6573d78' + 'f681bd2fd89429245ff092b689d53c7e119a5f1270a414b8629896fd5ced90f5' + '12be02a7c3b63679703874b45b6491c81402bac7159859936fe4c78a47632ded' + 'ee6e4c8d8b3d12c141bfeff1aa8c7afd027259d8aa0d1380f6854d4619d868ad' + 'ad05064999c0da91866c82063e42ef60107142acc878fde0342bd629d26d3e68' + 'bf19a2b9e4ef745227eb593d93543020d31726e38d000c6a0091aeb0c3931ffc' + 'f4268c3aa84fc1e14281470251217c1a4a36317c6ad93fc772ea4a46218a3a67' + 'e523ba32ad8163db09fd516a51756f726434dbbe2a2dd9697a5e6a6d606097c1') + +prepare() { + cp *.py.patch ${pkgname}-${pkgver}/evaluation + cp *.patch ${pkgname}-${pkgver}/ + cp ${pkgname}-${pkgver}/evaluation/centrifuge_evaluate_mason.py.patch ${pkgname}-${pkgver}/evaluation/test + cd "${pkgname}-${pkgver}" + # patch Makefile to be inline with archlinux build guidelines + patch -p1 < Makefile.patch + # patch for arm compatibility (not tested fully) + patch -p1 < alphabet.cpp.patch + patch -p1 < alphabet.h.patch + # patch scripts for python3 compatibility + patch -p1 < centrifuge-build.patch + patch -p1 < centrifuge-inspect.patch + cd evaluation + patch -p1 < centrifuge_evaluate.py.patch + patch -p1 < centrifuge_simulate_reads.py.patch + cd test + patch -p1 < centrifuge_evaluate_mason.py.patch +} build() { - cd "${srcdir}"/"${pkgname}"-"${pkgver}"-beta - make + cd "${pkgname}-${pkgver}" + make prefix="/usr" } package() { - cd "${srcdir}"/"${pkgname}"-"${pkgver}"-beta - make install prefix="${pkgdir}"/usr/bin - - install -Dm644 MANUAL "${pkgdir}"/usr/share/doc/"${pkgname}"/MANUAL - install -Dm644 MANUAL.markdown "${pkgdir}"/usr/share/doc/"${pkgname}"/MANUAL.markdown - install -Dm644 AUTHORS "${pkgdir}"/usr/share/doc/"${pkgname}"/AUTHORS - install -Dm644 NEWS "${pkgdir}"/usr/share/doc/"${pkgname}"/NEWS + cd "${pkgname}-${pkgver}" + make DESTDIR=${pkgdir} install } diff --git a/alphabet.cpp.patch b/alphabet.cpp.patch new file mode 100644 index 000000000000..40d3e02efa36 --- /dev/null +++ b/alphabet.cpp.patch @@ -0,0 +1,11 @@ +--- a/alphabet.cpp 2023-04-14 21:35:23.646820868 +0530 ++++ b/alphabet.cpp 2023-04-14 22:09:01.836357064 +0530 +@@ -400,7 +400,7 @@ + + const char *iupacs = "!ACMGRSVTWYHKDBN!acmgrsvtwyhkdbn"; + +-char mask2iupac[16] = { ++signed char mask2iupac[16] = { + -1, + 'A', // 0001 + 'C', // 0010 diff --git a/alphabet.h.patch b/alphabet.h.patch new file mode 100644 index 000000000000..ecdb02621507 --- /dev/null +++ b/alphabet.h.patch @@ -0,0 +1,11 @@ +--- a/alphabet.h 2023-04-14 21:35:23.700151724 +0530 ++++ b/alphabet.h 2023-04-14 22:09:01.839690247 +0530 +@@ -65,7 +65,7 @@ + /// corresponding 2-bit nucleotide + extern uint8_t nuccol2nuc[5][5]; + /// Convert a 4-bit mask into an IUPAC code +-extern char mask2iupac[16]; ++extern signed char mask2iupac[16]; + + /// Convert an ascii color to an ascii dna char + extern char col2dna[]; diff --git a/centrifuge-build.patch b/centrifuge-build.patch new file mode 100644 index 000000000000..8e5abc0daa5c --- /dev/null +++ b/centrifuge-build.patch @@ -0,0 +1,8 @@ +--- a/centrifuge-build 2023-04-14 21:25:00.850209371 +0530 ++++ b/centrifuge-build 2023-04-14 21:47:01.335920008 +0530 +@@ -1,4 +1,4 @@ +-#!/usr/bin/env python ++#!/usr/bin/python3 + + """ + Copyright 2014, Daehwan Kim <infphilo@gmail.com> diff --git a/centrifuge-inspect.patch b/centrifuge-inspect.patch new file mode 100644 index 000000000000..7661396c194b --- /dev/null +++ b/centrifuge-inspect.patch @@ -0,0 +1,8 @@ +--- a/centrifuge-inspect 2023-04-14 21:25:52.644760023 +0530 ++++ b/centrifuge-inspect 2023-04-14 21:47:08.818932252 +0530 +@@ -1,4 +1,4 @@ +-#!/usr/bin/env python ++#!/usr/bin/python3 + + """ + Copyright 2014, Daehwan Kim <infphilo@gmail.com> diff --git a/centrifuge_evaluate.py.patch b/centrifuge_evaluate.py.patch new file mode 100644 index 000000000000..13e501d02263 --- /dev/null +++ b/centrifuge_evaluate.py.patch @@ -0,0 +1,135 @@ +--- a/centrifuge_evaluate.py 2023-04-14 21:26:44.029327465 +0530 ++++ b/centrifuge_evaluate.py 2023-04-14 21:53:50.941828413 +0530 +@@ -1,4 +1,4 @@ +-#!/usr/bin/env python ++#!/usr/bin/python3 + + import sys, os, subprocess, inspect + import platform, multiprocessing +@@ -25,7 +25,7 @@ + """ + def compare_scm(centrifuge_out, true_out, taxonomy_tree, rank): + ancestors = set() +- for tax_id in taxonomy_tree.keys(): ++ for tax_id in list(taxonomy_tree.keys()): + if tax_id in ancestors: + continue + while True: +@@ -106,7 +106,7 @@ + unclassified += 1 + + raw_unique_classified = 0 +- for value in db_dic.values(): ++ for value in list(db_dic.values()): + if len(value) == 1: + raw_unique_classified += 1 + return classified, unique_classified, unclassified, len(db_dic), raw_unique_classified +@@ -152,7 +152,7 @@ + if tax_id in db_dic: + SSR += (abundance - db_dic[tax_id]) ** 2; + if debug: +- print >> sys.stderr, "\t\t\t\t{:<10}: {:.6} vs. {:.6} (truth vs. centrifuge)".format(tax_id, abundance, db_dic[tax_id]) ++ print("\t\t\t\t{:<10}: {:.6} vs. {:.6} (truth vs. centrifuge)".format(tax_id, abundance, db_dic[tax_id]), file=sys.stderr) + else: + SSR += (abundance) ** 2 + +@@ -179,7 +179,7 @@ + """ + def create_sql_db(sql_db): + if os.path.exists(sql_db): +- print >> sys.stderr, sql_db, "already exists!" ++ print(sql_db, "already exists!", file=sys.stderr) + return + + columns = [ +@@ -316,7 +316,7 @@ + os.mkdir(index_path) + index_fnames = ["%s/%s.%d.cf" % (index_path, index_base, i+1) for i in range(3)] + if not check_files(index_fnames): +- print >> sys.stderr, "Downloading indexes: %s" % ("index") ++ print("Downloading indexes: %s" % ("index"), file=sys.stderr) + os.system("cd %s; wget ftp://ftp.ccb.jhu.edu/pub/infphilo/centrifuge/data/%s.tar.gz; tar xvzf %s.tar.gz; rm %s.tar.gz; ln -s %s/%s* .; cd -" % \ + (index_path, index_base, index_base, index_base, index_base, index_base)) + assert check_files(index_fnames) +@@ -356,7 +356,7 @@ + scm_fname = "%s/%s.scm" % (read_path, read_base) + read_fnames = [read1_fname, read2_fname, truth_fname, scm_fname] + if not check_files(read_fnames): +- print >> sys.stderr, "Simulating reads %s_1.fq %s_2.fq ..." % (read_base, read_base) ++ print("Simulating reads %s_1.fq %s_2.fq ..." % (read_base, read_base), file=sys.stderr) + centrifuge_simulate = os.path.join(path_base, "centrifuge_simulate_reads.py") + simulate_cmd = [centrifuge_simulate, + "--num-fragment", str(num_fragment)] +@@ -377,11 +377,11 @@ + else: + base_fname = read_base + "_single" + +- print >> sys.stderr, "Database: %s" % (index_base) ++ print("Database: %s" % (index_base), file=sys.stderr) + if paired: +- print >> sys.stderr, "\t%d million pairs" % (num_fragment / 1000000) ++ print("\t%d million pairs" % (num_fragment / 1000000), file=sys.stderr) + else: +- print >> sys.stderr, "\t%d million reads" % (num_fragment / 1000000) ++ print("\t%d million reads" % (num_fragment / 1000000), file=sys.stderr) + + program_bin_base = "%s/.." % path_base + def get_program_version(program, version): +@@ -428,7 +428,7 @@ + if version: + program_name += ("_%s" % version) + +- print >> sys.stderr, "\t%s\t%s" % (program_name, str(datetime.now())) ++ print("\t%s\t%s" % (program_name, str(datetime.now())), file=sys.stderr) + if paired: + program_dir = program_name + "_paired" + else: +@@ -449,7 +449,7 @@ + program_cmd = get_program_cmd(program, version, read1_fname, read2_fname, out_fname) + start_time = datetime.now() + if verbose: +- print >> sys.stderr, "\t", start_time, " ".join(program_cmd) ++ print("\t", start_time, " ".join(program_cmd), file=sys.stderr) + if program in ["centrifuge"]: + proc = subprocess.Popen(program_cmd, stdout=open(out_fname, "w"), stderr=subprocess.PIPE) + else: +@@ -462,7 +462,7 @@ + if duration < 0.1: + duration = 0.1 + if verbose: +- print >> sys.stderr, "\t", finish_time, "finished:", duration ++ print("\t", finish_time, "finished:", duration, file=sys.stderr) + + results = {"strain" : [0, 0, 0], + "species" : [0, 0, 0], +@@ -484,21 +484,21 @@ + # if rank == "strain": + # assert num_cases == num_fragment + +- print >> sys.stderr, "\t\t%s" % rank +- print >> sys.stderr, "\t\t\tsensitivity: {:,} / {:,} ({:.2%})".format(classified, num_cases, float(classified) / num_cases) +- print >> sys.stderr, "\t\t\tprecision : {:,} / {:,} ({:.2%})".format(classified, raw_classified, float(classified) / raw_classified) +- print >> sys.stderr, "\n\t\t\tfor uniquely classified ", ++ print("\t\t%s" % rank, file=sys.stderr) ++ print("\t\t\tsensitivity: {:,} / {:,} ({:.2%})".format(classified, num_cases, float(classified) / num_cases), file=sys.stderr) ++ print("\t\t\tprecision : {:,} / {:,} ({:.2%})".format(classified, raw_classified, float(classified) / raw_classified), file=sys.stderr) ++ print("\n\t\t\tfor uniquely classified ", end=' ', file=sys.stderr) + if paired: +- print >> sys.stderr, "pairs" ++ print("pairs", file=sys.stderr) + else: +- print >> sys.stderr, "reads" +- print >> sys.stderr, "\t\t\t\t\tsensitivity: {:,} / {:,} ({:.2%})".format(unique_classified, num_cases, float(unique_classified) / num_cases) +- print >> sys.stderr, "\t\t\t\t\tprecision : {:,} / {:,} ({:.2%})".format(unique_classified, raw_unique_classified, float(unique_classified) / raw_unique_classified) ++ print("reads", file=sys.stderr) ++ print("\t\t\t\t\tsensitivity: {:,} / {:,} ({:.2%})".format(unique_classified, num_cases, float(unique_classified) / num_cases), file=sys.stderr) ++ print("\t\t\t\t\tprecision : {:,} / {:,} ({:.2%})".format(unique_classified, raw_unique_classified, float(unique_classified) / raw_unique_classified), file=sys.stderr) + + # Calculate sum of squared residuals in abundance + if rank == "strain": + abundance_SSR = compare_abundance("centrifuge_report.tsv", truth_fname, taxonomy_tree, debug) +- print >> sys.stderr, "\t\t\tsum of squared residuals in abundance: {}".format(abundance_SSR) ++ print("\t\t\tsum of squared residuals in abundance: {}".format(abundance_SSR), file=sys.stderr) + + if runtime_only: + os.chdir("..") diff --git a/centrifuge_evaluate_mason.py.patch b/centrifuge_evaluate_mason.py.patch new file mode 100644 index 000000000000..88c309bf795a --- /dev/null +++ b/centrifuge_evaluate_mason.py.patch @@ -0,0 +1,127 @@ +--- a/centrifuge_evaluate_mason.py 2023-04-14 21:29:29.482568396 +0530 ++++ b/centrifuge_evaluate_mason.py 2023-04-14 22:05:44.988504275 +0530 +@@ -1,4 +1,4 @@ +-#!/usr/bin/env python ++#!/usr/bin/python3 + + import sys, os, subprocess, inspect + import platform, multiprocessing +@@ -27,7 +27,7 @@ + higher_ranked = {} + + ancestors = set() +- for tax_id in taxonomy_tree.keys(): ++ for tax_id in list(taxonomy_tree.keys()): + if tax_id in ancestors: + continue + while True: +@@ -82,7 +82,7 @@ + + fields = line.strip().split('\t') + if len(fields) != 3: +- print >> sys.stderr, "Warning: %s missing" % (line.strip()) ++ print("Warning: %s missing" % (line.strip()), file=sys.stderr) + continue + read_name, tax_id = fields[1:3] + # Traverse up taxonomy tree to match the given rank parameter +@@ -117,7 +117,7 @@ + # print read_name + + raw_unique_classified = 0 +- for read_name, maps in db_dic.items(): ++ for read_name, maps in list(db_dic.items()): + if len(maps) == 1 and read_name not in higher_ranked: + raw_unique_classified += 1 + return classified, unique_classified, unclassified, len(db_dic), raw_unique_classified +@@ -184,7 +184,7 @@ + read_fname] + + if verbose: +- print >> sys.stderr, ' '.join(centrifuge_cmd) ++ print(' '.join(centrifuge_cmd), file=sys.stderr) + + out_fname = "centrifuge.output" + proc = subprocess.Popen(centrifuge_cmd, stdout=open(out_fname, "w"), stderr=subprocess.PIPE) +@@ -208,12 +208,12 @@ + # if rank == "strain": + # assert num_cases == num_fragment + +- print >> sys.stderr, "\t\t%s" % rank +- print >> sys.stderr, "\t\t\tsensitivity: {:,} / {:,} ({:.2%})".format(classified, num_cases, float(classified) / num_cases) +- print >> sys.stderr, "\t\t\tprecision : {:,} / {:,} ({:.2%})".format(classified, raw_classified, float(classified) / raw_classified) +- print >> sys.stderr, "\n\t\t\tfor uniquely classified " +- print >> sys.stderr, "\t\t\t\t\tsensitivity: {:,} / {:,} ({:.2%})".format(unique_classified, num_cases, float(unique_classified) / num_cases) +- print >> sys.stderr, "\t\t\t\t\tprecision : {:,} / {:,} ({:.2%})".format(unique_classified, raw_unique_classified, float(unique_classified) / raw_unique_classified) ++ print("\t\t%s" % rank, file=sys.stderr) ++ print("\t\t\tsensitivity: {:,} / {:,} ({:.2%})".format(classified, num_cases, float(classified) / num_cases), file=sys.stderr) ++ print("\t\t\tprecision : {:,} / {:,} ({:.2%})".format(classified, raw_classified, float(classified) / raw_classified), file=sys.stderr) ++ print("\n\t\t\tfor uniquely classified ", file=sys.stderr) ++ print("\t\t\t\t\tsensitivity: {:,} / {:,} ({:.2%})".format(unique_classified, num_cases, float(unique_classified) / num_cases), file=sys.stderr) ++ print("\t\t\t\t\tprecision : {:,} / {:,} ({:.2%})".format(unique_classified, raw_unique_classified, float(unique_classified) / raw_unique_classified), file=sys.stderr) + + # Calculate sum of squared residuals in abundance + """ +@@ -252,12 +252,12 @@ + if rank_taxID not in true_abundance: + true_abundance[rank_taxID] = 0.0 + true_abundance[rank_taxID] += (reads / float(genomeSize)) +- for taxID, reads in true_abundance.items(): ++ for taxID, reads in list(true_abundance.items()): + true_abundance[taxID] /= total_sum + +- print >> sys.stderr, "number of genomes:", num_genomes +- print >> sys.stderr, "number of species:", num_species +- print >> sys.stderr, "number of uniq species:", len(true_abundance) ++ print("number of genomes:", num_genomes, file=sys.stderr) ++ print("number of species:", num_species, file=sys.stderr) ++ print("number of uniq species:", len(true_abundance), file=sys.stderr) + + read_fname = "centrifuge_data/bacteria_sim10M/bacteria_sim10M.fa" + summary_fname = "centrifuge.summary" +@@ -271,14 +271,14 @@ + read_fname] + + if verbose: +- print >> sys.stderr, ' '.join(centrifuge_cmd) ++ print(' '.join(centrifuge_cmd), file=sys.stderr) + + out_fname = "centrifuge.output" + proc = subprocess.Popen(centrifuge_cmd, stdout=open(out_fname, "w"), stderr=subprocess.PIPE) + proc.communicate() + + calc_abundance = {} +- for taxID in true_abundance.keys(): ++ for taxID in list(true_abundance.keys()): + calc_abundance[taxID] = 0.0 + first = True + for line in open(summary_fname): +@@ -296,12 +296,12 @@ + """ + + abundance_file = open("abundance.cmp", 'w') +- print >> abundance_file, "taxID\ttrue\tcalc\trank" ++ print("taxID\ttrue\tcalc\trank", file=abundance_file) + for rank in ranks: + if rank == "strain": + continue + true_abundance_rank, calc_abundance_rank = {}, {} +- for taxID in true_abundance.keys(): ++ for taxID in list(true_abundance.keys()): + assert taxID in calc_abundance + rank_taxID = taxID + while True: +@@ -322,11 +322,11 @@ + calc_abundance_rank[rank_taxID] += calc_abundance[taxID] + + ssr = 0.0 # Sum of Squared Residuals +- for taxID in true_abundance_rank.keys(): ++ for taxID in list(true_abundance_rank.keys()): + assert taxID in calc_abundance_rank + ssr += (true_abundance_rank[taxID] - calc_abundance_rank[taxID]) ** 2 +- print >> abundance_file, "%s\t%.6f\t%.6f\t%s" % (taxID, true_abundance_rank[taxID], calc_abundance_rank[taxID], rank) +- print >> sys.stderr, "%s) Sum of squared residuals: %.6f" % (rank, ssr) ++ print("%s\t%.6f\t%.6f\t%s" % (taxID, true_abundance_rank[taxID], calc_abundance_rank[taxID], rank), file=abundance_file) ++ print("%s) Sum of squared residuals: %.6f" % (rank, ssr), file=sys.stderr) + abundance_file.close() + + diff --git a/centrifuge_simulate_reads.py.patch b/centrifuge_simulate_reads.py.patch new file mode 100644 index 000000000000..c27c431e13b9 --- /dev/null +++ b/centrifuge_simulate_reads.py.patch @@ -0,0 +1,143 @@ +--- a/centrifuge_simulate_reads.py 2023-04-14 21:27:38.630430207 +0530 ++++ b/centrifuge_simulate_reads.py 2023-04-14 22:03:27.790914404 +0530 +@@ -1,4 +1,4 @@ +-#!/usr/bin/env python ++#!/usr/bin/python3 + + # + # Copyright 2015, Daehwan Kim <infphilo@gmail.com> +@@ -156,7 +156,7 @@ + transcripts[transcript_id][2].append([left, right]) + + # Sort exons and merge where separating introns are <=5 bps +- for tran, [chr, strand, exons] in transcripts.items(): ++ for tran, [chr, strand, exons] in list(transcripts.items()): + exons.sort() + tmp_exons = [exons[0]] + for i in range(1, len(exons)): +@@ -167,7 +167,7 @@ + transcripts[tran] = [chr, strand, tmp_exons] + + tmp_transcripts = {} +- for tran, [chr, strand, exons] in transcripts.items(): ++ for tran, [chr, strand, exons] in list(transcripts.items()): + exon_lens = [e[1] - e[0] + 1 for e in exons] + transcript_len = sum(exon_lens) + if transcript_len >= frag_len: +@@ -444,8 +444,8 @@ + MD += ("{}".format(MD_match_len)) + + if len(read_seq) != read_len: +- print >> sys.stderr, "read length differs:", len(read_seq), "vs.", read_len +- print >> sys.stderr, pos, "".join(cigars), cigar_descs, MD, XM, NM, Zs ++ print("read length differs:", len(read_seq), "vs.", read_len, file=sys.stderr) ++ print(pos, "".join(cigars), cigar_descs, MD, XM, NM, Zs, file=sys.stderr) + assert False + + return pos, cigars, cigar_descs, MD, XM, NM, Zs, read_seq +@@ -575,8 +575,8 @@ + tMD += ("{}".format(match_len)) + + if tMD != MD or tXM != XM or tNM != NM or XM > max_mismatch or XM != NM: +- print >> sys.stderr, chr, pos, cigar, MD, XM, NM, Zs +- print >> sys.stderr, tMD, tXM, tNM ++ print(chr, pos, cigar, MD, XM, NM, Zs, file=sys.stderr) ++ print(tMD, tXM, tNM, file=sys.stderr) + assert False + + +@@ -631,7 +631,7 @@ + # Read genome sequences into memory + genomes_fname = index_fname + ".fa" + if not os.path.exists(genomes_fname): +- print >> sys.stderr, "Extracting genomes from Centrifuge index to %s, which may take a few hours ..." % (genomes_fname) ++ print("Extracting genomes from Centrifuge index to %s, which may take a few hours ..." % (genomes_fname), file=sys.stderr) + extract_cmd = [centrifuge_inspect, + index_fname] + extract_proc = subprocess.Popen(extract_cmd, stdout=open(genomes_fname, 'w')) +@@ -660,15 +660,15 @@ + assert num_frag == sum(expr_profile) + + if dna: +- genome_ids = genome_seqs.keys() ++ genome_ids = list(genome_seqs.keys()) + else: +- transcript_ids = transcripts.keys() ++ transcript_ids = list(transcripts.keys()) + random.shuffle(transcript_ids) + assert len(transcript_ids) >= len(expr_profile) + + # Truth table + truth_file = open(base_fname + ".truth", "w") +- print >> truth_file, "taxID\tgenomeLen\tnumReads\tabundance\tname" ++ print("taxID\tgenomeLen\tnumReads\tabundance\tname", file=truth_file) + truth_list = [] + normalized_sum = 0.0 + debug_num_frag = 0 +@@ -695,19 +695,19 @@ + if can_tax_id in names: + name = names[can_tax_id] + abundance = raw_abundance / genome_len / normalized_sum +- print >> truth_file, "{}\t{}\t{}\t{:.6}\t{}".format(tax_id, genome_len, t_num_frags, abundance, name) ++ print("{}\t{}\t{}\t{:.6}\t{}".format(tax_id, genome_len, t_num_frags, abundance, name), file=truth_file) + truth_file.close() + + # Sequence Classification Map (SCM) - something I made up ;-) + scm_file = open(base_fname + ".scm", "w") + + # Write SCM header +- print >> scm_file, "@HD\tVN:1.0\tSO:unsorted" +- for tax_id in genome_seqs.keys(): ++ print("@HD\tVN:1.0\tSO:unsorted", file=scm_file) ++ for tax_id in list(genome_seqs.keys()): + name = "" + if tax_id in names: + name = names[tax_id] +- print >> scm_file, "@SQ\tTID:%s\tSN:%s\tLN:%d" % (tax_id, name, len(genome_seqs[tax_id])) ++ print("@SQ\tTID:%s\tSN:%s\tLN:%d" % (tax_id, name, len(genome_seqs[tax_id])), file=scm_file) + + read_file = open(base_fname + "_1.fa", "w") + if paired_end: +@@ -718,11 +718,11 @@ + t_num_frags = expr_profile[t] + if dna: + tax_id = genome_ids[t] +- print >> sys.stderr, "TaxID: %s, num fragments: %d" % (tax_id, t_num_frags) ++ print("TaxID: %s, num fragments: %d" % (tax_id, t_num_frags), file=sys.stderr) + else: + transcript_id = transcript_ids[t] + chr, strand, transcript_len, exons = transcripts[transcript_id] +- print >> sys.stderr, transcript_id, t_num_frags ++ print(transcript_id, t_num_frags, file=sys.stderr) + + genome_seq = genome_seqs[tax_id] + genome_len = len(genome_seq) +@@ -763,14 +763,14 @@ + XS = "\tXS:A:{}".format(strand) + TI = "\tTI:Z:{}".format(transcript_id) + +- print >> read_file, ">{}".format(cur_read_id) +- print >> read_file, read_seq ++ print(">{}".format(cur_read_id), file=read_file) ++ print(read_seq, file=read_file) + output = "{}\t{}\t{}\t{}\tNM:i:{}\tMD:Z:{}".format(cur_read_id, tax_id, pos + 1, cigar_str, NM, MD) + if paired_end: +- print >> read2_file, ">{}".format(cur_read_id) +- print >> read2_file, reverse_complement(read2_seq) ++ print(">{}".format(cur_read_id), file=read2_file) ++ print(reverse_complement(read2_seq), file=read2_file) + output += "\t{}\t{}\tNM2:i:{}\tMD2:Z:{}".format(pos2 + 1, cigar2_str, NM2, MD2) +- print >> scm_file, output ++ print(output, file=scm_file) + + cur_read_id += 1 + +@@ -865,7 +865,7 @@ + parser.print_help() + exit(1) + if not args.dna: +- print >> sys.stderr, "Error: --rna is not implemented." ++ print("Error: --rna is not implemented.", file=sys.stderr) + exit(1) + # if args.dna: + # args.expr_profile = "constant" |