diff options
Diffstat (limited to 'centrifuge_simulate_reads.py.patch')
-rw-r--r-- | centrifuge_simulate_reads.py.patch | 143 |
1 files changed, 143 insertions, 0 deletions
diff --git a/centrifuge_simulate_reads.py.patch b/centrifuge_simulate_reads.py.patch new file mode 100644 index 000000000000..c27c431e13b9 --- /dev/null +++ b/centrifuge_simulate_reads.py.patch @@ -0,0 +1,143 @@ +--- a/centrifuge_simulate_reads.py 2023-04-14 21:27:38.630430207 +0530 ++++ b/centrifuge_simulate_reads.py 2023-04-14 22:03:27.790914404 +0530 +@@ -1,4 +1,4 @@ +-#!/usr/bin/env python ++#!/usr/bin/python3 + + # + # Copyright 2015, Daehwan Kim <infphilo@gmail.com> +@@ -156,7 +156,7 @@ + transcripts[transcript_id][2].append([left, right]) + + # Sort exons and merge where separating introns are <=5 bps +- for tran, [chr, strand, exons] in transcripts.items(): ++ for tran, [chr, strand, exons] in list(transcripts.items()): + exons.sort() + tmp_exons = [exons[0]] + for i in range(1, len(exons)): +@@ -167,7 +167,7 @@ + transcripts[tran] = [chr, strand, tmp_exons] + + tmp_transcripts = {} +- for tran, [chr, strand, exons] in transcripts.items(): ++ for tran, [chr, strand, exons] in list(transcripts.items()): + exon_lens = [e[1] - e[0] + 1 for e in exons] + transcript_len = sum(exon_lens) + if transcript_len >= frag_len: +@@ -444,8 +444,8 @@ + MD += ("{}".format(MD_match_len)) + + if len(read_seq) != read_len: +- print >> sys.stderr, "read length differs:", len(read_seq), "vs.", read_len +- print >> sys.stderr, pos, "".join(cigars), cigar_descs, MD, XM, NM, Zs ++ print("read length differs:", len(read_seq), "vs.", read_len, file=sys.stderr) ++ print(pos, "".join(cigars), cigar_descs, MD, XM, NM, Zs, file=sys.stderr) + assert False + + return pos, cigars, cigar_descs, MD, XM, NM, Zs, read_seq +@@ -575,8 +575,8 @@ + tMD += ("{}".format(match_len)) + + if tMD != MD or tXM != XM or tNM != NM or XM > max_mismatch or XM != NM: +- print >> sys.stderr, chr, pos, cigar, MD, XM, NM, Zs +- print >> sys.stderr, tMD, tXM, tNM ++ print(chr, pos, cigar, MD, XM, NM, Zs, file=sys.stderr) ++ print(tMD, tXM, tNM, file=sys.stderr) + assert False + + +@@ -631,7 +631,7 @@ + # Read genome sequences into memory + genomes_fname = index_fname + ".fa" + if not os.path.exists(genomes_fname): +- print >> sys.stderr, "Extracting genomes from Centrifuge index to %s, which may take a few hours ..." % (genomes_fname) ++ print("Extracting genomes from Centrifuge index to %s, which may take a few hours ..." % (genomes_fname), file=sys.stderr) + extract_cmd = [centrifuge_inspect, + index_fname] + extract_proc = subprocess.Popen(extract_cmd, stdout=open(genomes_fname, 'w')) +@@ -660,15 +660,15 @@ + assert num_frag == sum(expr_profile) + + if dna: +- genome_ids = genome_seqs.keys() ++ genome_ids = list(genome_seqs.keys()) + else: +- transcript_ids = transcripts.keys() ++ transcript_ids = list(transcripts.keys()) + random.shuffle(transcript_ids) + assert len(transcript_ids) >= len(expr_profile) + + # Truth table + truth_file = open(base_fname + ".truth", "w") +- print >> truth_file, "taxID\tgenomeLen\tnumReads\tabundance\tname" ++ print("taxID\tgenomeLen\tnumReads\tabundance\tname", file=truth_file) + truth_list = [] + normalized_sum = 0.0 + debug_num_frag = 0 +@@ -695,19 +695,19 @@ + if can_tax_id in names: + name = names[can_tax_id] + abundance = raw_abundance / genome_len / normalized_sum +- print >> truth_file, "{}\t{}\t{}\t{:.6}\t{}".format(tax_id, genome_len, t_num_frags, abundance, name) ++ print("{}\t{}\t{}\t{:.6}\t{}".format(tax_id, genome_len, t_num_frags, abundance, name), file=truth_file) + truth_file.close() + + # Sequence Classification Map (SCM) - something I made up ;-) + scm_file = open(base_fname + ".scm", "w") + + # Write SCM header +- print >> scm_file, "@HD\tVN:1.0\tSO:unsorted" +- for tax_id in genome_seqs.keys(): ++ print("@HD\tVN:1.0\tSO:unsorted", file=scm_file) ++ for tax_id in list(genome_seqs.keys()): + name = "" + if tax_id in names: + name = names[tax_id] +- print >> scm_file, "@SQ\tTID:%s\tSN:%s\tLN:%d" % (tax_id, name, len(genome_seqs[tax_id])) ++ print("@SQ\tTID:%s\tSN:%s\tLN:%d" % (tax_id, name, len(genome_seqs[tax_id])), file=scm_file) + + read_file = open(base_fname + "_1.fa", "w") + if paired_end: +@@ -718,11 +718,11 @@ + t_num_frags = expr_profile[t] + if dna: + tax_id = genome_ids[t] +- print >> sys.stderr, "TaxID: %s, num fragments: %d" % (tax_id, t_num_frags) ++ print("TaxID: %s, num fragments: %d" % (tax_id, t_num_frags), file=sys.stderr) + else: + transcript_id = transcript_ids[t] + chr, strand, transcript_len, exons = transcripts[transcript_id] +- print >> sys.stderr, transcript_id, t_num_frags ++ print(transcript_id, t_num_frags, file=sys.stderr) + + genome_seq = genome_seqs[tax_id] + genome_len = len(genome_seq) +@@ -763,14 +763,14 @@ + XS = "\tXS:A:{}".format(strand) + TI = "\tTI:Z:{}".format(transcript_id) + +- print >> read_file, ">{}".format(cur_read_id) +- print >> read_file, read_seq ++ print(">{}".format(cur_read_id), file=read_file) ++ print(read_seq, file=read_file) + output = "{}\t{}\t{}\t{}\tNM:i:{}\tMD:Z:{}".format(cur_read_id, tax_id, pos + 1, cigar_str, NM, MD) + if paired_end: +- print >> read2_file, ">{}".format(cur_read_id) +- print >> read2_file, reverse_complement(read2_seq) ++ print(">{}".format(cur_read_id), file=read2_file) ++ print(reverse_complement(read2_seq), file=read2_file) + output += "\t{}\t{}\tNM2:i:{}\tMD2:Z:{}".format(pos2 + 1, cigar2_str, NM2, MD2) +- print >> scm_file, output ++ print(output, file=scm_file) + + cur_read_id += 1 + +@@ -865,7 +865,7 @@ + parser.print_help() + exit(1) + if not args.dna: +- print >> sys.stderr, "Error: --rna is not implemented." ++ print("Error: --rna is not implemented.", file=sys.stderr) + exit(1) + # if args.dna: + # args.expr_profile = "constant" |