diff --git a/ingest/bin/gene-coverage.py b/ingest/bin/gene-coverage.py index c7e77d8..dec3d89 100644 --- a/ingest/bin/gene-coverage.py +++ b/ingest/bin/gene-coverage.py @@ -53,7 +53,7 @@ def G_and_F_coverage(alignment, indices_G, indices_F): F_coverage[accession].append(coverages_F.get(accession, 0.00)) genome_coverage[accession].append(coverage_genome.get(accession, 0.00)) - metadata_files = ["data/b/metadata_no_covg.tsv", "data/a/metadata_no_covg.tsv"] + metadata_files = ["data/b/metadata_sorted.tsv", "data/a/metadata_sorted.tsv"] outputs = ["data/b/metadata.tsv", "data/a/metadata.tsv"] # this part of the script reads the already separated a and b metadata and adds the F and G covg values to the correct row based on accession diff --git a/ingest/bin/sort.py b/ingest/bin/sort.py index 33e9c30..1bc1bf1 100644 --- a/ingest/bin/sort.py +++ b/ingest/bin/sort.py @@ -59,8 +59,8 @@ def get_similarity(alignment, reference): elif a_or_b[record.id]=='B': b_sequences.append(record) - SeqIO.write(a_sequences, "data/a/sequences_notdedup.fasta","fasta") - SeqIO.write(b_sequences, "data/b/sequences_notdedup.fasta", "fasta") + SeqIO.write(a_sequences, "data/a/sequences.fasta","fasta") + SeqIO.write(b_sequences, "data/b/sequences.fasta", "fasta") metadata = pd.read_csv("data/metadata.tsv", sep="\t", index_col='accession') original_columns = metadata.columns @@ -71,5 +71,5 @@ def get_similarity(alignment, reference): columns=original_columns) b_metadata = pd.DataFrame(data=metadata.loc[metadata['type']=='B'], columns=original_columns) - a_metadata.to_csv('data/a/metadata_notdedup.tsv', sep="\t") - b_metadata.to_csv('data/b/metadata_notdedup.tsv', sep="\t") + a_metadata.to_csv('data/a/metadata_sorted.tsv', sep="\t") + b_metadata.to_csv('data/b/metadata_sorted.tsv', sep="\t") diff --git a/ingest/workflow/snakemake_rules/sort.smk b/ingest/workflow/snakemake_rules/sort.smk index b246f71..2b73e23 100644 --- a/ingest/workflow/snakemake_rules/sort.smk +++ b/ingest/workflow/snakemake_rules/sort.smk @@ -58,40 +58,22 @@ rule sort: metadata_a = expand("data/a/{time}_metadata.tsv", time=TIME) output: sequences_a = "data/a/sequences.fasta", - metadata_a = "data/a/metadata_notdedup.tsv", + metadata_a = "data/a/metadata_sorted.tsv", sequences_b = "data/b/sequences.fasta", - metadata_b = "data/b/metadata_notdedup.tsv" + metadata_b = "data/b/metadata_sorted.tsv" shell: """ python bin/sort.py """ -rule deduplication: - input: - metadata_a = rules.sort.output.metadata_a, - metadata_b = rules.sort.output.metadata_b - output: - dedup_metadata_a = "data/a/metadata_no_covg.tsv", - dedup_metadata_b = "data/b/metadata_no_covg.tsv" - shell: - """ - python bin/metadata_dedup.py \ - --metadata-original {input.metadata_a} \ - --metadata-output {output.dedup_metadata_a} - - python bin/metadata_dedup.py \ - --metadata-original {input.metadata_b} \ - --metadata-output {output.dedup_metadata_b} - """ - rule coverage: input: alignment_a = expand("data/a/{time}_sequences.aligned.fasta", time=TIME), alignment_b = expand("data/b/{time}_sequences.aligned.fasta", time=TIME), metadata_b = expand("data/b/{time}_metadata.tsv", time=TIME), metadata_a = expand("data/a/{time}_metadata.tsv", time=TIME), - dedup_metadata_a = rules.deduplication.output.dedup_metadata_a, - dedup_metadata_b = rules.deduplication.output.dedup_metadata_b + sorted_metadata_a = "data/a/metadata_sorted.tsv", + sorted_metadata_b = "data/b/metadata_sorted.tsv" output: metadata_a = "data/a/metadata.tsv", metadata_b = "data/b/metadata.tsv"