From 63f42efa512228d30050ecc5683e7f5a86d6624e Mon Sep 17 00:00:00 2001 From: Kenneth Durbrow Date: Mon, 12 Jul 2021 17:08:11 -0400 Subject: [PATCH 1/5] Initial commit --- tools/tax/src/reads_from.cpp | 80 ++++++++++++++++++++++++++++++++++++ 1 file changed, 80 insertions(+) create mode 100644 tools/tax/src/reads_from.cpp diff --git a/tools/tax/src/reads_from.cpp b/tools/tax/src/reads_from.cpp new file mode 100644 index 00000000..0b11746d --- /dev/null +++ b/tools/tax/src/reads_from.cpp @@ -0,0 +1,80 @@ +/*=========================================================================== + * + * PUBLIC DOMAIN NOTICE + * National Center for Biotechnology Information + * + * This software/database is a "United States Government Work" under the + * terms of the United States Copyright Act. It was written as part of + * the author's official duties as a United States Government employee and + * thus cannot be copyrighted. This software/database is freely available + * to the public for use. The National Library of Medicine and the U.S. + * Government have not placed any restriction on its use or reproduction. + * + * Although all reasonable efforts have been taken to ensure the accuracy + * and reliability of the software and data, the NLM and the U.S. + * Government do not and cannot warrant the performance or results that + * may be obtained by using this software or data. The NLM and the U.S. + * Government disclaim all warranties, express or implied, including + * warranties of performance, merchantability or fitness for any particular + * purpose. + * + * Please cite the author in any work or product based on this material. 
+ * + * =========================================================================== + * + */ + +#include "config_reads_from.hpp" + +#include +#include +#include +#include + +const std::string VERSION = "0.1"; + +typedef uint64_t hash_t; + +#include "reader.h" +#include "missing_cpp_features.h" + +using namespace std; +using namespace std::chrono; + +int main(int argc, char const *argv[]) +{ +#ifdef __GLIBCXX__ + std::set_terminate(__gnu_cxx::__verbose_terminate_handler); +#endif + + LOG("reads_from version " << VERSION); + Config config(argc, argv); + + auto const before = high_resolution_clock::now(); + + for (auto &contig_file : config.contig_files) + { + LOG(contig_file); + + try { + auto reader = Reader::create(contig_file, config); + Reader::Fragment frag; + + while (reader->read(chunk)) { + cout << '>' << frag.spotid << '\n' << + frag.bases << '\n'; + } + } + catch (std::exception const &e) + { + LOG(e.what()); + if (config.contig_files.size() == 1) + throw e; + } + } + + LOG("total time (sec) " << std::chrono::duration_cast( high_resolution_clock::now() - before ).count()); + + return 0; +} + From 754a6e30305c9c9d9d4b5bbe7c8b9755177acd40 Mon Sep 17 00:00:00 2001 From: Kenneth Durbrow Date: Mon, 12 Jul 2021 17:09:43 -0400 Subject: [PATCH 2/5] Initial commit --- tools/tax/src/config_reads_from.hpp | 130 ++++++++++++++++++++++++++++ 1 file changed, 130 insertions(+) create mode 100644 tools/tax/src/config_reads_from.hpp diff --git a/tools/tax/src/config_reads_from.hpp b/tools/tax/src/config_reads_from.hpp new file mode 100644 index 00000000..cdbd6857 --- /dev/null +++ b/tools/tax/src/config_reads_from.hpp @@ -0,0 +1,130 @@ +/*=========================================================================== +* +* PUBLIC DOMAIN NOTICE +* National Center for Biotechnology Information +* +* This software/database is a "United States Government Work" under the +* terms of the United States Copyright Act. 
It was written as part of +* the author's official duties as a United States Government employee and +* thus cannot be copyrighted. This software/database is freely available +* to the public for use. The National Library of Medicine and the U.S. +* Government have not placed any restriction on its use or reproduction. +* +* Although all reasonable efforts have been taken to ensure the accuracy +* and reliability of the software and data, the NLM and the U.S. +* Government do not and cannot warrant the performance or results that +* may be obtained by using this software or data. The NLM and the U.S. +* Government disclaim all warranties, express or implied, including +* warranties of performance, merchantability or fitness for any particular +* purpose. +* +* Please cite the author in any work or product based on this material. +* +* =========================================================================== +* +*/ + +#pragma once + +#include +#include +#include +#include +#include +#include "log.h" +#include "missing_cpp_features.h" + +struct Config +{ + std::list contig_files; + std::string spot_filter_file; + + bool unaligned_only = false; + bool hide_counts = false, compact = false; + + int optimization_ultrafast_skip_reader = 0; + + Config(int argc, char const *argv[]) + { + std::string contig_file; + + std::list args; + for (int i = 1; i < argc; ++i) + args.push_back(std::string(argv[i])); + + while (!args.empty()) { + auto arg = pop_arg(args); + + if (arg == "-unaligned_only") + unaligned_only = true; + else if (arg == "-spot_filter") + spot_filter_file = pop_arg(args); + else if (arg == "-optimization_ultrafast_skip_reader") + optimization_ultrafast_skip_reader = std::stoi(pop_arg(args)); + else if (arg.empty() || arg[0] == '-' || !contig_file.empty()) + { + std::string reason = "unexpected argument: " + arg; + fail(reason.c_str()); + } + else + contig_file = arg; + } + + // exactly one should exist + if (contig_file.empty()) // == contig_files.empty()) + 
fail("please provide either contig file or list"); + + if (ends_with(contig_file, ".list")) + contig_files = load_list(contig_file); + else + contig_files.push_back(contig_file); + + if (contig_files.empty()) + fail("loaded empty list of files to process"); + } + + static std::list load_list(const std::string &filename) + { + std::ifstream f(filename); + if (f.fail()) + throw std::runtime_error(std::string("cannot open list file ") + filename); + + std::list items; + + while (!f.eof()) + { + std::string s; + f >> s; + if (f.fail()) + break; + + items.push_back(s); + } + + return items; + } + + static void fail(const char* reason = "invalid arguments") + { + print_usage(); + LOG(reason); + exit(1); + } + + static void print_usage() + { + std::cerr << "need " << std::endl; + } + +private: + + static std::string pop_arg(std::list& args) + { + if (args.empty()) + fail("need more args"); + + std::string arg = args.front(); + args.pop_front(); + return arg; + } +}; From aa09cd044aaddb0cf4e0b2521430f96955fe341b Mon Sep 17 00:00:00 2001 From: Kenneth Durbrow Date: Tue, 13 Jul 2021 10:40:25 -0400 Subject: [PATCH 3/5] Added new executable --- tools/tax/CMakeLists.txt | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tools/tax/CMakeLists.txt b/tools/tax/CMakeLists.txt index 03068051..a6a15785 100644 --- a/tools/tax/CMakeLists.txt +++ b/tools/tax/CMakeLists.txt @@ -16,6 +16,10 @@ add_executable(aligns_to src/aligns_to.cpp) target_link_libraries(aligns_to PRIVATE ReaderLib ${SYS_LIBRARIES}) links_and_install_subdir(aligns_to tax) +add_executable(reads_from src/reads_from.cpp) +target_link_libraries(reads_from PRIVATE ReaderLib ${SYS_LIBRARIES}) +links_and_install_subdir(reads_from tax) + add_executable(dump_kmers src/dump_kmers.cpp) target_link_libraries(dump_kmers PRIVATE ReaderLib ${SYS_LIBRARIES}) links_and_install_subdir(dump_kmers tax) From 37dad051d534a4ed704d9ff53790fd45a07ad9a9 Mon Sep 17 00:00:00 2001 From: Kenneth Durbrow Date: Tue, 13 Jul 2021 10:41:04 -0400 
Subject: [PATCH 4/5] Refactored; changed usage text --- tools/tax/src/config_reads_from.hpp | 90 ++++++++++++++++++----------- tools/tax/src/reads_from.cpp | 42 +++++++++----- 2 files changed, 85 insertions(+), 47 deletions(-) diff --git a/tools/tax/src/config_reads_from.hpp b/tools/tax/src/config_reads_from.hpp index cdbd6857..69a2af7d 100644 --- a/tools/tax/src/config_reads_from.hpp +++ b/tools/tax/src/config_reads_from.hpp @@ -34,9 +34,10 @@ #include "log.h" #include "missing_cpp_features.h" -struct Config +struct Args { - std::list contig_files; + using FileList = std::list ; + FileList files; std::string spot_filter_file; bool unaligned_only = false; @@ -44,42 +45,57 @@ struct Config int optimization_ultrafast_skip_reader = 0; - Config(int argc, char const *argv[]) + Args(int argc, char const *argv[]) { - std::string contig_file; - - std::list args; - for (int i = 1; i < argc; ++i) - args.push_back(std::string(argv[i])); - - while (!args.empty()) { - auto arg = pop_arg(args); - - if (arg == "-unaligned_only") - unaligned_only = true; - else if (arg == "-spot_filter") - spot_filter_file = pop_arg(args); - else if (arg == "-optimization_ultrafast_skip_reader") - optimization_ultrafast_skip_reader = std::stoi(pop_arg(args)); - else if (arg.empty() || arg[0] == '-' || !contig_file.empty()) - { - std::string reason = "unexpected argument: " + arg; + int arg = 1; + auto have_arg = [&]() -> bool { return arg < argc; }; + auto pop_arg = [&]() -> std::string { + if (have_arg()) { + auto const result = argv[arg]; + ++arg; + return std::string(result); + } + fail("need more args"); + }; + std::string file; + auto have_file = false; + + while (have_arg()) { + auto const &arg = pop_arg(); + + if (arg.empty()) { +USAGE_ERROR: + std::string reason = "unexpected argument: \"" + arg + "\""; fail(reason.c_str()); } - else - contig_file = arg; + if (arg.front() == '-') { + if (arg == "-unaligned_only") + unaligned_only = true; + else if (arg == "-spot_filter") + spot_filter_file 
= pop_arg(); + else if (arg == "-optimization_ultrafast_skip_reader") + optimization_ultrafast_skip_reader = std::stoi(pop_arg()); + else + goto USAGE_ERROR; + } + else if (have_file) + goto USAGE_ERROR; + else { + file = arg; + have_file = true; + } } // exactly one should exist - if (contig_file.empty()) // == contig_files.empty()) - fail("please provide either contig file or list"); + if (!have_file) + fail("nothing to process"); - if (ends_with(contig_file, ".list")) - contig_files = load_list(contig_file); + if (ends_with(file, ".list")) + files = load_list(file); else - contig_files.push_back(contig_file); + files.push_back(file); - if (contig_files.empty()) + if (files.empty()) fail("loaded empty list of files to process"); } @@ -104,26 +120,32 @@ struct Config return items; } - static void fail(const char* reason = "invalid arguments") + static void fail [[noreturn]] (const char* reason = "invalid arguments") { - print_usage(); LOG(reason); + print_usage(); exit(1); } static void print_usage() { - std::cerr << "need " << std::endl; + std::cerr << + "need fasta/accession or file containing list of fasta/accession\n" + "examples:\n" + " reads_from inputs.list # NB. extension is .list\n" + " reads_from chr1.fasta # NB. 
extension is .fasta, .fa, or .fna\n" + " reads_from SRR000001 # not one of the above\n" + << std::endl; } private: - static std::string pop_arg(std::list& args) + static std::string const &pop_arg(std::list& args) { if (args.empty()) fail("need more args"); - std::string arg = args.front(); + std::string const &arg = args.front(); args.pop_front(); return arg; } diff --git a/tools/tax/src/reads_from.cpp b/tools/tax/src/reads_from.cpp index 0b11746d..818c76ea 100644 --- a/tools/tax/src/reads_from.cpp +++ b/tools/tax/src/reads_from.cpp @@ -41,26 +41,20 @@ typedef uint64_t hash_t; using namespace std; using namespace std::chrono; -int main(int argc, char const *argv[]) +static int process(Args::FileList const &files, Reader::Params const &params) { -#ifdef __GLIBCXX__ - std::set_terminate(__gnu_cxx::__verbose_terminate_handler); -#endif - - LOG("reads_from version " << VERSION); - Config config(argc, argv); - auto const before = high_resolution_clock::now(); + auto const multifile = files.size() > 1; - for (auto &contig_file : config.contig_files) + for (auto &file : files) { - LOG(contig_file); + LOG(file); try { - auto reader = Reader::create(contig_file, config); + auto reader = Reader::create(file, params); Reader::Fragment frag; - while (reader->read(chunk)) { + while (reader->read(&frag)) { cout << '>' << frag.spotid << '\n' << frag.bases << '\n'; } @@ -68,7 +62,7 @@ int main(int argc, char const *argv[]) catch (std::exception const &e) { LOG(e.what()); - if (config.contig_files.size() == 1) + if (!multifile) throw e; } } @@ -78,3 +72,25 @@ int main(int argc, char const *argv[]) return 0; } +static int process(Args const &args) +{ + Reader::Params params; + + params.filter_file = args.spot_filter_file; + params.ultrafast_skip_reader = args.optimization_ultrafast_skip_reader; + params.unaligned_only = args.unaligned_only; + + return process(args.files, params); +} + +int main(int argc, char const *argv[]) +{ +#ifdef __GLIBCXX__ +
std::set_terminate(__gnu_cxx::__verbose_terminate_handler); +#endif + + LOG("reads_from version " << VERSION); + + return process(Args(argc, argv)); +} + From 1732884d2cff9301634129f3d6eb7d5fc3d635bf Mon Sep 17 00:00:00 2001 From: Kenneth Durbrow Date: Tue, 13 Jul 2021 10:57:29 -0400 Subject: [PATCH 5/5] Removed unused function --- tools/tax/src/config_reads_from.hpp | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/tools/tax/src/config_reads_from.hpp b/tools/tax/src/config_reads_from.hpp index 69a2af7d..d61a7517 100644 --- a/tools/tax/src/config_reads_from.hpp +++ b/tools/tax/src/config_reads_from.hpp @@ -137,16 +137,4 @@ struct Args " reads_from SRR000001 # not one of the above\n" << std::endl; } - -private: - - static std::string const &pop_arg(std::list& args) - { - if (args.empty()) - fail("need more args"); - - std::string const &arg = args.front(); - args.pop_front(); - return arg; - } };