10 files changed, 618 insertions, 22 deletions
diff --git a/.SRCINFO b/.SRCINFO
index 8e30aae689ff..66de50072e53 100644
--- a/.SRCINFO
+++ b/.SRCINFO
@@ -1,14 +1,37 @@
 pkgbase = centrifuge
-	pkgdesc = Bioinformatics taxonomic classifier for microbial classification
+	pkgdesc = Rapid and memory-efficient tool for classification of metagenomic sequences
 	pkgver = 1.0.4
-	pkgrel = 1
+	pkgrel = 2
 	url = https://ccb.jhu.edu/software/centrifuge/
 	arch = x86_64
 	license = GPL3
-	depends = libpthread-stubs
-	depends = zlib
-	source = centrifuge-1.0.4.tar.gz::https://github.com/infphilo/centrifuge/archive/v1.0.4-beta.tar.gz
-	sha256sums = 64eb3aa3461d27462357811832f39a8f85702eb536482f1e67344761ad8ca757
+	makedepends = git
+	makedepends = inetutils
+	makedepends = pandoc-cli
+	depends = glibc
+	depends = gcc-libs
+	depends = python
+	depends = perl
+	depends = bash
+	depends = jellyfish
+	depends = hisat2
+	source = centrifuge-1.0.4.tar.gz::https://github.com/DaehwanKimLab/centrifuge/archive/refs/tags/v1.0.4.tar.gz
+	source = alphabet.cpp.patch
+	source = alphabet.h.patch
+	source = centrifuge-build.patch
+	source = centrifuge_evaluate_mason.py.patch
+	source = centrifuge_evaluate.py.patch
+	source = centrifuge-inspect.patch
+	source = centrifuge_simulate_reads.py.patch
+	source = Makefile.patch
+	sha256sums = 929daed0f84739f7636cc1ea2757527e83373f107107ffeb5937a403ba5201bc
+	sha256sums = ae9334ddd0cb9b09811969c151350c4e5ee73452fe5d00f01fd7cf23d6573d78
+	sha256sums = f681bd2fd89429245ff092b689d53c7e119a5f1270a414b8629896fd5ced90f5
+	sha256sums = 12be02a7c3b63679703874b45b6491c81402bac7159859936fe4c78a47632ded
+	sha256sums = ee6e4c8d8b3d12c141bfeff1aa8c7afd027259d8aa0d1380f6854d4619d868ad
+	sha256sums = ad05064999c0da91866c82063e42ef60107142acc878fde0342bd629d26d3e68
+	sha256sums = bf19a2b9e4ef745227eb593d93543020d31726e38d000c6a0091aeb0c3931ffc
+	sha256sums = f4268c3aa84fc1e14281470251217c1a4a36317c6ad93fc772ea4a46218a3a67
+	sha256sums = e523ba32ad8163db09fd516a51756f726434dbbe2a2dd9697a5e6a6d606097c1
 
 pkgname = centrifuge
-
diff --git a/Makefile.patch b/Makefile.patch
new file mode 100644
index 000000000000..89ab3540e169
--- /dev/null
+++ b/Makefile.patch
@@ -0,0 +1,88 @@
+--- a/Makefile	2021-08-17 00:18:48.000000000 +0530
++++ b/Makefile	2023-04-14 23:02:47.795861027 +0530
+@@ -141,7 +141,7 @@
+ #GIT_VERSION = $(shell command -v git 2>&1 > /dev/null && git describe --long --tags --dirty --always --abbrev=10 || cat VERSION)
+ 
+ # Convert BITS=?? to a -m flag
+-BITS=32
++BITS=
+ ifeq (x86_64,$(shell uname -m))
+ BITS=64
+ endif
+@@ -160,15 +160,15 @@
+ ifeq (64,$(BITS))
+ 	BITS_FLAG = -m64
+ endif
+-SSE_FLAG=-msse2
++SSE_FLAG=
+ 
+ DEBUG_FLAGS    = -O0 -g3 $(BIToS_FLAG) $(SSE_FLAG) -std=c++11
+ DEBUG_DEFS     = -DCOMPILER_OPTIONS="\"$(DEBUG_FLAGS) $(EXTRA_FLAGS)\""
+-RELEASE_FLAGS  = -O3 $(BITS_FLAG) $(SSE_FLAG) -funroll-loops -g3 -std=c++11
++RELEASE_FLAGS  = -O2 $(BITS_FLAG) $(SSE_FLAG) -funroll-loops -g3 -std=c++11
+ RELEASE_DEFS   = -DCOMPILER_OPTIONS="\"$(RELEASE_FLAGS) $(EXTRA_FLAGS)\""
+ NOASSERT_FLAGS = -DNDEBUG
+ FILE_FLAGS     = -D_LARGEFILE_SOURCE -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE
+-CFLAGS         = 
++#CFLAGS         = 
+ #CFLAGS         = -fdiagnostics-color=always
+ 
+ ifeq (1,$(USE_SRA))
+@@ -255,7 +255,8 @@
+ 	 $(CFLAGS) \
+      $(PREF_DEF) \
+      $(MM_DEF) \
+-     $(SHMEM_DEF)
++     $(SHMEM_DEF) \
++	 $(LDFLAGS)
+ 
+ #
+ # centrifuge targets
+@@ -363,11 +364,11 @@
+ 
+ centrifuge-build.bat:
+ 	echo "@echo off" > centrifuge-build.bat
+-	echo "python %~dp0/centrifuge-build %*" >> centrifuge-build.bat
++	echo "python3 %~dp0/centrifuge-build %*" >> centrifuge-build.bat
+ 
+ centrifuge-inspect.bat:
+ 	echo "@echo off" > centrifuge-inspect.bat
+-	echo "python %~dp0/centrifuge-inspect %*" >> centrifuge-inspect.bat
++	echo "python3 %~dp0/centrifuge-inspect %*" >> centrifuge-inspect.bat
+ 
+ 
+ .PHONY: centrifuge-src
+@@ -409,24 +410,24 @@
+ MANUAL: MANUAL.markdown
+ 	perl doc/strip_markdown.pl < $^ > $@
+ 
+-prefix=/usr/local
++prefix=/usr/
+ 
+ .PHONY: install
+ install: all
+-	mkdir -p $(prefix)/bin
+-	mkdir -p $(prefix)/share/centrifuge/indices
+-	install -m 0644 indices/Makefile $(prefix)/share/centrifuge/indices
+-	install -d -m 0755 $(prefix)/share/centrifuge/doc
+-	install -m 0644 doc/* $(prefix)/share/centrifuge/doc
++	mkdir -p $(DESTDIR)$(prefix)/bin
++	mkdir -p $(DESTDIR)$(prefix)/share/centrifuge/indices
++	install -m 0644 indices/Makefile $(DESTDIR)$(prefix)/share/centrifuge/indices
++	install -d -m 0755 $(DESTDIR)$(prefix)/share/doc/centrifuge
++	install -m 0644 doc/* $(DESTDIR)$(prefix)/share/doc/centrifuge
+ 	for file in $(CENTRIFUGE_BIN_LIST) $(CENTRIFUGE_SCRIPT_LIST); do \
+-		install -m 0755 $$file $(prefix)/bin ; \
++		install -m 0755 $$file $(DESTDIR)$(prefix)/bin ; \
+ 	done
+ 
+ .PHONY: uninstall
+ uninstall: all
+ 	for file in $(CENTRIFUGE_BIN_LIST) $(CENTRIFUGE_SCRIPT_LIST); do \
+-		rm -v $(prefix)/bin/$$file ; \
+-		rm -v $(prefix)/share/centrifuge; \
++		rm -v $(DESTDIR)$(prefix)/bin/$$file ; \
++		rm -v $(DESTDIR)$(prefix)/share/centrifuge; \
+ 	done
+ 
+ 
diff --git a/PKGBUILD b/PKGBUILD
index 9b673936dcbe..26f20dce5999 100644
--- a/PKGBUILD
+++ b/PKGBUILD
@@ -1,27 +1,69 @@
-# Maintainer: Clint Valentine <valentine.clint@gmail.com>
+# Maintainer: Bipin Kumar <kbipinkumar@pm.me> 
+# Previous Maintainer: Clint Valentine <valentine.clint@gmail.com>
 
 pkgname=centrifuge
 pkgver=1.0.4
-pkgrel=1
-pkgdesc="Bioinformatics taxonomic classifier for microbial classification"
+pkgrel=2
+pkgdesc="Rapid and memory-efficient tool for classification of metagenomic sequences"
 arch=('x86_64')
 url=https://ccb.jhu.edu/software/centrifuge/
 license=('GPL3')
-depends=('libpthread-stubs' 'zlib')
-source=("${pkgname}"-"${pkgver}".tar.gz::https://github.com/infphilo/"${pkgname}"/archive/v"${pkgver}"-beta.tar.gz)
-sha256sums=('64eb3aa3461d27462357811832f39a8f85702eb536482f1e67344761ad8ca757')
+depends=(
+         'glibc'
+         'gcc-libs'
+         'python'
+         'perl'
+         'bash'
+         'jellyfish'
+         'hisat2'
+        )
+makedepends=('git' 'inetutils' 'pandoc-cli')
+source=("${pkgname}-${pkgver}.tar.gz::https://github.com/DaehwanKimLab/centrifuge/archive/refs/tags/v${pkgver}.tar.gz"
+        'alphabet.cpp.patch'
+        'alphabet.h.patch'
+        'centrifuge-build.patch'
+        'centrifuge_evaluate_mason.py.patch'
+        'centrifuge_evaluate.py.patch'
+        'centrifuge-inspect.patch'
+        'centrifuge_simulate_reads.py.patch'
+        'Makefile.patch'
+        )
+sha256sums=('929daed0f84739f7636cc1ea2757527e83373f107107ffeb5937a403ba5201bc'
+            'ae9334ddd0cb9b09811969c151350c4e5ee73452fe5d00f01fd7cf23d6573d78'
+            'f681bd2fd89429245ff092b689d53c7e119a5f1270a414b8629896fd5ced90f5'
+            '12be02a7c3b63679703874b45b6491c81402bac7159859936fe4c78a47632ded'
+            'ee6e4c8d8b3d12c141bfeff1aa8c7afd027259d8aa0d1380f6854d4619d868ad'
+            'ad05064999c0da91866c82063e42ef60107142acc878fde0342bd629d26d3e68'
+            'bf19a2b9e4ef745227eb593d93543020d31726e38d000c6a0091aeb0c3931ffc'
+            'f4268c3aa84fc1e14281470251217c1a4a36317c6ad93fc772ea4a46218a3a67'
+            'e523ba32ad8163db09fd516a51756f726434dbbe2a2dd9697a5e6a6d606097c1')
+
+prepare() {
+  # Apply every patch in place, reading it straight from ${srcdir}, so no
+  # patch file has to be copied into the unpacked tree. All patches carry
+  # a/ and b/ path prefixes (hence -p1); -N refuses reversed/applied hunks.
+  cd "${srcdir}/${pkgname}-${pkgver}"
+
+  # patch Makefile to be in line with Arch Linux packaging guidelines
+  patch -Np1 -i "${srcdir}/Makefile.patch"
+  # patch for arm compatibility (not tested fully)
+  patch -Np1 -i "${srcdir}/alphabet.cpp.patch"
+  patch -Np1 -i "${srcdir}/alphabet.h.patch"
+  # patch scripts for python3 compatibility
+  patch -Np1 -i "${srcdir}/centrifuge-build.patch"
+  patch -Np1 -i "${srcdir}/centrifuge-inspect.patch"
+  # the evaluation scripts sit in subdirectories; -d changes into them first
+  patch -Np1 -d evaluation -i "${srcdir}/centrifuge_evaluate.py.patch"
+  patch -Np1 -d evaluation -i "${srcdir}/centrifuge_simulate_reads.py.patch"
+  patch -Np1 -d evaluation/test -i "${srcdir}/centrifuge_evaluate_mason.py.patch"
+}
 
 build() {
-  cd "${srcdir}"/"${pkgname}"-"${pkgver}"-beta
-  make
+  cd "${srcdir}/${pkgname}-${pkgver}"
+  make prefix=/usr  # must match the prefix handed to 'make install' in package()
 }
 
 package() {
-  cd "${srcdir}"/"${pkgname}"-"${pkgver}"-beta
-  make install prefix="${pkgdir}"/usr/bin
-
-  install -Dm644 MANUAL "${pkgdir}"/usr/share/doc/"${pkgname}"/MANUAL
-  install -Dm644 MANUAL.markdown "${pkgdir}"/usr/share/doc/"${pkgname}"/MANUAL.markdown
-  install -Dm644 AUTHORS "${pkgdir}"/usr/share/doc/"${pkgname}"/AUTHORS
-  install -Dm644 NEWS "${pkgdir}"/usr/share/doc/"${pkgname}"/NEWS
+  cd "${srcdir}/${pkgname}-${pkgver}"
+  make DESTDIR="${pkgdir}" prefix=/usr install  # quoted: pkgdir may contain spaces
 }
diff --git a/alphabet.cpp.patch b/alphabet.cpp.patch
new file mode 100644
index 000000000000..40d3e02efa36
--- /dev/null
+++ b/alphabet.cpp.patch
@@ -0,0 +1,11 @@
+--- a/alphabet.cpp	2023-04-14 21:35:23.646820868 +0530
++++ b/alphabet.cpp	2023-04-14 22:09:01.836357064 +0530
+@@ -400,7 +400,7 @@
+ 
+ const char *iupacs = "!ACMGRSVTWYHKDBN!acmgrsvtwyhkdbn";
+ 
+-char mask2iupac[16] = {
++signed char mask2iupac[16] = {
+ 	-1,
+ 	'A', // 0001
+ 	'C', // 0010
diff --git a/alphabet.h.patch b/alphabet.h.patch
new file mode 100644
index 000000000000..ecdb02621507
--- /dev/null
+++ b/alphabet.h.patch
@@ -0,0 +1,11 @@
+--- a/alphabet.h	2023-04-14 21:35:23.700151724 +0530
++++ b/alphabet.h	2023-04-14 22:09:01.839690247 +0530
+@@ -65,7 +65,7 @@
+ /// corresponding 2-bit nucleotide
+ extern uint8_t nuccol2nuc[5][5];
+ /// Convert a 4-bit mask into an IUPAC code
+-extern char mask2iupac[16];
++extern signed char mask2iupac[16];
+ 
+ /// Convert an ascii color to an ascii dna char
+ extern char col2dna[];
diff --git a/centrifuge-build.patch b/centrifuge-build.patch
new file mode 100644
index 000000000000..8e5abc0daa5c
--- /dev/null
+++ b/centrifuge-build.patch
@@ -0,0 +1,8 @@
+--- a/centrifuge-build	2023-04-14 21:25:00.850209371 +0530
++++ b/centrifuge-build	2023-04-14 21:47:01.335920008 +0530
+@@ -1,4 +1,4 @@
+-#!/usr/bin/env python
++#!/usr/bin/python3
+ 
+ """
+  Copyright 2014, Daehwan Kim <infphilo@gmail.com>
diff --git a/centrifuge-inspect.patch b/centrifuge-inspect.patch
new file mode 100644
index 000000000000..7661396c194b
--- /dev/null
+++ b/centrifuge-inspect.patch
@@ -0,0 +1,8 @@
+--- a/centrifuge-inspect	2023-04-14 21:25:52.644760023 +0530
++++ b/centrifuge-inspect	2023-04-14 21:47:08.818932252 +0530
+@@ -1,4 +1,4 @@
+-#!/usr/bin/env python
++#!/usr/bin/python3
+ 
+ """
+  Copyright 2014, Daehwan Kim <infphilo@gmail.com>
diff --git a/centrifuge_evaluate.py.patch b/centrifuge_evaluate.py.patch
new file mode 100644
index 000000000000..13e501d02263
--- /dev/null
+++ b/centrifuge_evaluate.py.patch
@@ -0,0 +1,135 @@
+--- a/centrifuge_evaluate.py	2023-04-14 21:26:44.029327465 +0530
++++ b/centrifuge_evaluate.py	2023-04-14 21:53:50.941828413 +0530
+@@ -1,4 +1,4 @@
+-#!/usr/bin/env python
++#!/usr/bin/python3
+ 
+ import sys, os, subprocess, inspect
+ import platform, multiprocessing
+@@ -25,7 +25,7 @@
+ """
+ def compare_scm(centrifuge_out, true_out, taxonomy_tree, rank):
+     ancestors = set()
+-    for tax_id in taxonomy_tree.keys():
++    for tax_id in list(taxonomy_tree.keys()):
+         if tax_id in ancestors:
+             continue
+         while True:
+@@ -106,7 +106,7 @@
+             unclassified += 1
+ 
+     raw_unique_classified = 0
+-    for value in db_dic.values():
++    for value in list(db_dic.values()):
+         if len(value) == 1:
+             raw_unique_classified += 1
+     return classified, unique_classified, unclassified, len(db_dic), raw_unique_classified
+@@ -152,7 +152,7 @@
+         if tax_id in db_dic:
+             SSR += (abundance - db_dic[tax_id]) ** 2;
+             if debug:
+-                print >> sys.stderr, "\t\t\t\t{:<10}: {:.6} vs. {:.6} (truth vs. centrifuge)".format(tax_id, abundance, db_dic[tax_id])
++                print("\t\t\t\t{:<10}: {:.6} vs. {:.6} (truth vs. centrifuge)".format(tax_id, abundance, db_dic[tax_id]), file=sys.stderr)
+         else:
+             SSR += (abundance) ** 2
+ 
+@@ -179,7 +179,7 @@
+ """
+ def create_sql_db(sql_db):
+     if os.path.exists(sql_db):
+-        print >> sys.stderr, sql_db, "already exists!"
++        print(sql_db, "already exists!", file=sys.stderr)
+         return
+     
+     columns = [
+@@ -316,7 +316,7 @@
+         os.mkdir(index_path)
+     index_fnames = ["%s/%s.%d.cf" % (index_path, index_base, i+1) for i in range(3)]
+     if not check_files(index_fnames):
+-        print >> sys.stderr, "Downloading indexes: %s" % ("index")
++        print("Downloading indexes: %s" % ("index"), file=sys.stderr)
+         os.system("cd %s; wget ftp://ftp.ccb.jhu.edu/pub/infphilo/centrifuge/data/%s.tar.gz; tar xvzf %s.tar.gz; rm %s.tar.gz; ln -s %s/%s* .; cd -" % \
+                       (index_path, index_base, index_base, index_base, index_base, index_base))
+         assert check_files(index_fnames)        
+@@ -356,7 +356,7 @@
+     scm_fname = "%s/%s.scm" % (read_path, read_base)
+     read_fnames = [read1_fname, read2_fname, truth_fname, scm_fname]
+     if not check_files(read_fnames):
+-        print >> sys.stderr, "Simulating reads %s_1.fq %s_2.fq ..." % (read_base, read_base)
++        print("Simulating reads %s_1.fq %s_2.fq ..." % (read_base, read_base), file=sys.stderr)
+         centrifuge_simulate = os.path.join(path_base, "centrifuge_simulate_reads.py")
+         simulate_cmd = [centrifuge_simulate,
+                         "--num-fragment", str(num_fragment)]
+@@ -377,11 +377,11 @@
+     else:
+         base_fname = read_base + "_single"
+ 
+-    print >> sys.stderr, "Database: %s" % (index_base)
++    print("Database: %s" % (index_base), file=sys.stderr)
+     if paired:
+-        print >> sys.stderr, "\t%d million pairs" % (num_fragment / 1000000)
++        print("\t%d million pairs" % (num_fragment / 1000000), file=sys.stderr)
+     else:
+-        print >> sys.stderr, "\t%d million reads" % (num_fragment / 1000000)
++        print("\t%d million reads" % (num_fragment / 1000000), file=sys.stderr)
+ 
+     program_bin_base = "%s/.." % path_base
+     def get_program_version(program, version):
+@@ -428,7 +428,7 @@
+         if version:
+             program_name += ("_%s" % version)
+ 
+-        print >> sys.stderr, "\t%s\t%s" % (program_name, str(datetime.now()))
++        print("\t%s\t%s" % (program_name, str(datetime.now())), file=sys.stderr)
+         if paired:
+             program_dir = program_name + "_paired"
+         else:
+@@ -449,7 +449,7 @@
+         program_cmd = get_program_cmd(program, version, read1_fname, read2_fname, out_fname)
+         start_time = datetime.now()
+         if verbose:
+-            print >> sys.stderr, "\t", start_time, " ".join(program_cmd)
++            print("\t", start_time, " ".join(program_cmd), file=sys.stderr)
+         if program in ["centrifuge"]:
+             proc = subprocess.Popen(program_cmd, stdout=open(out_fname, "w"), stderr=subprocess.PIPE)
+         else:
+@@ -462,7 +462,7 @@
+         if duration < 0.1:
+             duration = 0.1
+         if verbose:
+-            print >> sys.stderr, "\t", finish_time, "finished:", duration            
++            print("\t", finish_time, "finished:", duration, file=sys.stderr)            
+ 
+         results = {"strain"  : [0, 0, 0],
+                    "species" : [0, 0, 0],
+@@ -484,21 +484,21 @@
+             # if rank == "strain":
+             #    assert num_cases == num_fragment
+ 
+-            print >> sys.stderr, "\t\t%s" % rank
+-            print >> sys.stderr, "\t\t\tsensitivity: {:,} / {:,} ({:.2%})".format(classified, num_cases, float(classified) / num_cases)
+-            print >> sys.stderr, "\t\t\tprecision  : {:,} / {:,} ({:.2%})".format(classified, raw_classified, float(classified) / raw_classified)
+-            print >> sys.stderr, "\n\t\t\tfor uniquely classified ",
++            print("\t\t%s" % rank, file=sys.stderr)
++            print("\t\t\tsensitivity: {:,} / {:,} ({:.2%})".format(classified, num_cases, float(classified) / num_cases), file=sys.stderr)
++            print("\t\t\tprecision  : {:,} / {:,} ({:.2%})".format(classified, raw_classified, float(classified) / raw_classified), file=sys.stderr)
++            print("\n\t\t\tfor uniquely classified ", end=' ', file=sys.stderr)
+             if paired:
+-                print >> sys.stderr, "pairs"
++                print("pairs", file=sys.stderr)
+             else:
+-                print >> sys.stderr, "reads"
+-            print >> sys.stderr, "\t\t\t\t\tsensitivity: {:,} / {:,} ({:.2%})".format(unique_classified, num_cases, float(unique_classified) / num_cases)
+-            print >> sys.stderr, "\t\t\t\t\tprecision  : {:,} / {:,} ({:.2%})".format(unique_classified, raw_unique_classified, float(unique_classified) / raw_unique_classified)
++                print("reads", file=sys.stderr)
++            print("\t\t\t\t\tsensitivity: {:,} / {:,} ({:.2%})".format(unique_classified, num_cases, float(unique_classified) / num_cases), file=sys.stderr)
++            print("\t\t\t\t\tprecision  : {:,} / {:,} ({:.2%})".format(unique_classified, raw_unique_classified, float(unique_classified) / raw_unique_classified), file=sys.stderr)
+ 
+             # Calculate sum of squared residuals in abundance
+             if rank == "strain":
+                 abundance_SSR = compare_abundance("centrifuge_report.tsv", truth_fname, taxonomy_tree, debug)
+-                print >> sys.stderr, "\t\t\tsum of squared residuals in abundance: {}".format(abundance_SSR)
++                print("\t\t\tsum of squared residuals in abundance: {}".format(abundance_SSR), file=sys.stderr)
+ 
+         if runtime_only:
+             os.chdir("..")
diff --git a/centrifuge_evaluate_mason.py.patch b/centrifuge_evaluate_mason.py.patch
new file mode 100644
index 000000000000..88c309bf795a
--- /dev/null
+++ b/centrifuge_evaluate_mason.py.patch
@@ -0,0 +1,127 @@
+--- a/centrifuge_evaluate_mason.py	2023-04-14 21:29:29.482568396 +0530
++++ b/centrifuge_evaluate_mason.py	2023-04-14 22:05:44.988504275 +0530
+@@ -1,4 +1,4 @@
+-#!/usr/bin/env python
++#!/usr/bin/python3
+ 
+ import sys, os, subprocess, inspect
+ import platform, multiprocessing
+@@ -27,7 +27,7 @@
+     higher_ranked = {}
+         
+     ancestors = set()
+-    for tax_id in taxonomy_tree.keys():
++    for tax_id in list(taxonomy_tree.keys()):
+         if tax_id in ancestors:
+             continue
+         while True:
+@@ -82,7 +82,7 @@
+ 
+         fields = line.strip().split('\t')
+         if len(fields) != 3:
+-            print >> sys.stderr, "Warning: %s missing" % (line.strip())
++            print("Warning: %s missing" % (line.strip()), file=sys.stderr)
+             continue
+         read_name, tax_id = fields[1:3] 
+         # Traverse up taxonomy tree to match the given rank parameter
+@@ -117,7 +117,7 @@
+             # print read_name
+ 
+     raw_unique_classified = 0
+-    for read_name, maps in db_dic.items():
++    for read_name, maps in list(db_dic.items()):
+         if len(maps) == 1 and read_name not in higher_ranked:
+             raw_unique_classified += 1
+     return classified, unique_classified, unclassified, len(db_dic), raw_unique_classified
+@@ -184,7 +184,7 @@
+                       read_fname]
+ 
+     if verbose:
+-        print >> sys.stderr, ' '.join(centrifuge_cmd)
++        print(' '.join(centrifuge_cmd), file=sys.stderr)
+ 
+     out_fname = "centrifuge.output"
+     proc = subprocess.Popen(centrifuge_cmd, stdout=open(out_fname, "w"), stderr=subprocess.PIPE)
+@@ -208,12 +208,12 @@
+         # if rank == "strain":
+         #    assert num_cases == num_fragment
+ 
+-        print >> sys.stderr, "\t\t%s" % rank
+-        print >> sys.stderr, "\t\t\tsensitivity: {:,} / {:,} ({:.2%})".format(classified, num_cases, float(classified) / num_cases)
+-        print >> sys.stderr, "\t\t\tprecision  : {:,} / {:,} ({:.2%})".format(classified, raw_classified, float(classified) / raw_classified)
+-        print >> sys.stderr, "\n\t\t\tfor uniquely classified "
+-        print >> sys.stderr, "\t\t\t\t\tsensitivity: {:,} / {:,} ({:.2%})".format(unique_classified, num_cases, float(unique_classified) / num_cases)
+-        print >> sys.stderr, "\t\t\t\t\tprecision  : {:,} / {:,} ({:.2%})".format(unique_classified, raw_unique_classified, float(unique_classified) / raw_unique_classified)
++        print("\t\t%s" % rank, file=sys.stderr)
++        print("\t\t\tsensitivity: {:,} / {:,} ({:.2%})".format(classified, num_cases, float(classified) / num_cases), file=sys.stderr)
++        print("\t\t\tprecision  : {:,} / {:,} ({:.2%})".format(classified, raw_classified, float(classified) / raw_classified), file=sys.stderr)
++        print("\n\t\t\tfor uniquely classified ", file=sys.stderr)
++        print("\t\t\t\t\tsensitivity: {:,} / {:,} ({:.2%})".format(unique_classified, num_cases, float(unique_classified) / num_cases), file=sys.stderr)
++        print("\t\t\t\t\tprecision  : {:,} / {:,} ({:.2%})".format(unique_classified, raw_unique_classified, float(unique_classified) / raw_unique_classified), file=sys.stderr)
+ 
+         # Calculate sum of squared residuals in abundance
+         """
+@@ -252,12 +252,12 @@
+         if rank_taxID not in true_abundance:
+             true_abundance[rank_taxID] = 0.0
+         true_abundance[rank_taxID] += (reads / float(genomeSize))
+-    for taxID, reads in true_abundance.items():
++    for taxID, reads in list(true_abundance.items()):
+         true_abundance[taxID] /= total_sum
+ 
+-    print >> sys.stderr, "number of genomes:", num_genomes
+-    print >> sys.stderr, "number of species:", num_species
+-    print >> sys.stderr, "number of uniq species:", len(true_abundance)
++    print("number of genomes:", num_genomes, file=sys.stderr)
++    print("number of species:", num_species, file=sys.stderr)
++    print("number of uniq species:", len(true_abundance), file=sys.stderr)
+ 
+     read_fname = "centrifuge_data/bacteria_sim10M/bacteria_sim10M.fa"
+     summary_fname = "centrifuge.summary"
+@@ -271,14 +271,14 @@
+                       read_fname]
+ 
+     if verbose:
+-        print >> sys.stderr, ' '.join(centrifuge_cmd)
++        print(' '.join(centrifuge_cmd), file=sys.stderr)
+ 
+     out_fname = "centrifuge.output"
+     proc = subprocess.Popen(centrifuge_cmd, stdout=open(out_fname, "w"), stderr=subprocess.PIPE)
+     proc.communicate()
+ 
+     calc_abundance = {}
+-    for taxID in true_abundance.keys():
++    for taxID in list(true_abundance.keys()):
+         calc_abundance[taxID] = 0.0
+     first = True
+     for line in open(summary_fname):
+@@ -296,12 +296,12 @@
+         """
+ 
+     abundance_file = open("abundance.cmp", 'w')
+-    print >> abundance_file, "taxID\ttrue\tcalc\trank"
++    print("taxID\ttrue\tcalc\trank", file=abundance_file)
+     for rank in ranks:
+         if rank == "strain":
+             continue
+         true_abundance_rank, calc_abundance_rank = {}, {}
+-        for taxID in true_abundance.keys():
++        for taxID in list(true_abundance.keys()):
+             assert taxID in calc_abundance
+             rank_taxID = taxID
+             while True:
+@@ -322,11 +322,11 @@
+             calc_abundance_rank[rank_taxID] += calc_abundance[taxID]
+ 
+         ssr = 0.0 # Sum of Squared Residuals
+-        for taxID in true_abundance_rank.keys():
++        for taxID in list(true_abundance_rank.keys()):
+             assert taxID in calc_abundance_rank
+             ssr += (true_abundance_rank[taxID] - calc_abundance_rank[taxID]) ** 2
+-            print >> abundance_file, "%s\t%.6f\t%.6f\t%s" % (taxID, true_abundance_rank[taxID], calc_abundance_rank[taxID], rank)
+-        print >> sys.stderr, "%s) Sum of squared residuals: %.6f" % (rank, ssr)
++            print("%s\t%.6f\t%.6f\t%s" % (taxID, true_abundance_rank[taxID], calc_abundance_rank[taxID], rank), file=abundance_file)
++        print("%s) Sum of squared residuals: %.6f" % (rank, ssr), file=sys.stderr)
+     abundance_file.close()
+ 
+ 
diff --git a/centrifuge_simulate_reads.py.patch b/centrifuge_simulate_reads.py.patch
new file mode 100644
index 000000000000..c27c431e13b9
--- /dev/null
+++ b/centrifuge_simulate_reads.py.patch
@@ -0,0 +1,143 @@
+--- a/centrifuge_simulate_reads.py	2023-04-14 21:27:38.630430207 +0530
++++ b/centrifuge_simulate_reads.py	2023-04-14 22:03:27.790914404 +0530
+@@ -1,4 +1,4 @@
+-#!/usr/bin/env python
++#!/usr/bin/python3
+ 
+ #
+ # Copyright 2015, Daehwan Kim <infphilo@gmail.com>
+@@ -156,7 +156,7 @@
+             transcripts[transcript_id][2].append([left, right])
+ 
+     # Sort exons and merge where separating introns are <=5 bps
+-    for tran, [chr, strand, exons] in transcripts.items():
++    for tran, [chr, strand, exons] in list(transcripts.items()):
+             exons.sort()
+             tmp_exons = [exons[0]]
+             for i in range(1, len(exons)):
+@@ -167,7 +167,7 @@
+             transcripts[tran] = [chr, strand, tmp_exons]
+ 
+     tmp_transcripts = {}
+-    for tran, [chr, strand, exons] in transcripts.items():
++    for tran, [chr, strand, exons] in list(transcripts.items()):
+         exon_lens = [e[1] - e[0] + 1 for e in exons]
+         transcript_len = sum(exon_lens)
+         if transcript_len >= frag_len:
+@@ -444,8 +444,8 @@
+         MD += ("{}".format(MD_match_len))
+ 
+     if len(read_seq) != read_len:
+-        print >> sys.stderr, "read length differs:", len(read_seq), "vs.", read_len
+-        print >> sys.stderr, pos, "".join(cigars), cigar_descs, MD, XM, NM, Zs
++        print("read length differs:", len(read_seq), "vs.", read_len, file=sys.stderr)
++        print(pos, "".join(cigars), cigar_descs, MD, XM, NM, Zs, file=sys.stderr)
+         assert False
+ 
+     return pos, cigars, cigar_descs, MD, XM, NM, Zs, read_seq
+@@ -575,8 +575,8 @@
+         tMD += ("{}".format(match_len))
+ 
+     if tMD != MD or tXM != XM or tNM != NM or XM > max_mismatch or XM != NM:
+-        print >> sys.stderr, chr, pos, cigar, MD, XM, NM, Zs
+-        print >> sys.stderr, tMD, tXM, tNM
++        print(chr, pos, cigar, MD, XM, NM, Zs, file=sys.stderr)
++        print(tMD, tXM, tNM, file=sys.stderr)
+         assert False
+         
+         
+@@ -631,7 +631,7 @@
+     # Read genome sequences into memory
+     genomes_fname = index_fname + ".fa"
+     if not os.path.exists(genomes_fname):
+-        print >> sys.stderr, "Extracting genomes from Centrifuge index to %s, which may take a few hours ..."  % (genomes_fname)
++        print("Extracting genomes from Centrifuge index to %s, which may take a few hours ..."  % (genomes_fname), file=sys.stderr)
+         extract_cmd = [centrifuge_inspect,
+                        index_fname]
+         extract_proc = subprocess.Popen(extract_cmd, stdout=open(genomes_fname, 'w'))
+@@ -660,15 +660,15 @@
+     assert num_frag == sum(expr_profile)
+ 
+     if dna:
+-        genome_ids = genome_seqs.keys()
++        genome_ids = list(genome_seqs.keys())
+     else:
+-        transcript_ids = transcripts.keys()
++        transcript_ids = list(transcripts.keys())
+         random.shuffle(transcript_ids)
+         assert len(transcript_ids) >= len(expr_profile)
+ 
+     # Truth table
+     truth_file = open(base_fname + ".truth", "w")
+-    print >> truth_file, "taxID\tgenomeLen\tnumReads\tabundance\tname"
++    print("taxID\tgenomeLen\tnumReads\tabundance\tname", file=truth_file)
+     truth_list = []
+     normalized_sum = 0.0
+     debug_num_frag = 0
+@@ -695,19 +695,19 @@
+         if can_tax_id in names:
+             name = names[can_tax_id]
+         abundance = raw_abundance / genome_len / normalized_sum
+-        print >> truth_file, "{}\t{}\t{}\t{:.6}\t{}".format(tax_id, genome_len, t_num_frags, abundance, name)
++        print("{}\t{}\t{}\t{:.6}\t{}".format(tax_id, genome_len, t_num_frags, abundance, name), file=truth_file)
+     truth_file.close()
+ 
+     # Sequence Classification Map (SCM) - something I made up ;-)
+     scm_file = open(base_fname + ".scm", "w")
+ 
+     # Write SCM header
+-    print >> scm_file, "@HD\tVN:1.0\tSO:unsorted"
+-    for tax_id in genome_seqs.keys():
++    print("@HD\tVN:1.0\tSO:unsorted", file=scm_file)
++    for tax_id in list(genome_seqs.keys()):
+         name = ""
+         if tax_id in names:
+             name = names[tax_id]
+-        print >> scm_file, "@SQ\tTID:%s\tSN:%s\tLN:%d" % (tax_id, name, len(genome_seqs[tax_id]))
++        print("@SQ\tTID:%s\tSN:%s\tLN:%d" % (tax_id, name, len(genome_seqs[tax_id])), file=scm_file)
+ 
+     read_file = open(base_fname + "_1.fa", "w")
+     if paired_end:
+@@ -718,11 +718,11 @@
+         t_num_frags = expr_profile[t]
+         if dna:
+             tax_id = genome_ids[t]
+-            print >> sys.stderr, "TaxID: %s, num fragments: %d" % (tax_id, t_num_frags)
++            print("TaxID: %s, num fragments: %d" % (tax_id, t_num_frags), file=sys.stderr)
+         else:
+             transcript_id = transcript_ids[t]
+             chr, strand, transcript_len, exons = transcripts[transcript_id]
+-            print >> sys.stderr, transcript_id, t_num_frags
++            print(transcript_id, t_num_frags, file=sys.stderr)
+ 
+         genome_seq = genome_seqs[tax_id]
+         genome_len = len(genome_seq)
+@@ -763,14 +763,14 @@
+                 XS = "\tXS:A:{}".format(strand)
+                 TI = "\tTI:Z:{}".format(transcript_id)                
+ 
+-            print >> read_file, ">{}".format(cur_read_id)
+-            print >> read_file, read_seq
++            print(">{}".format(cur_read_id), file=read_file)
++            print(read_seq, file=read_file)
+             output = "{}\t{}\t{}\t{}\tNM:i:{}\tMD:Z:{}".format(cur_read_id, tax_id, pos + 1, cigar_str, NM, MD)
+             if paired_end:
+-                print >> read2_file, ">{}".format(cur_read_id)
+-                print >> read2_file, reverse_complement(read2_seq)
++                print(">{}".format(cur_read_id), file=read2_file)
++                print(reverse_complement(read2_seq), file=read2_file)
+                 output += "\t{}\t{}\tNM2:i:{}\tMD2:Z:{}".format(pos2 + 1, cigar2_str, NM2, MD2)
+-            print >> scm_file, output
++            print(output, file=scm_file)
+                 
+             cur_read_id += 1
+             
+@@ -865,7 +865,7 @@
+         parser.print_help()
+         exit(1)
+     if not args.dna:
+-        print >> sys.stderr, "Error: --rna is not implemented."
++        print("Error: --rna is not implemented.", file=sys.stderr)
+         exit(1)
+     # if args.dna:
+     #    args.expr_profile = "constant"