diff --git a/tools/tax/CMakeLists.txt b/tools/tax/CMakeLists.txt index 03068051..a6a15785 100644 --- a/tools/tax/CMakeLists.txt +++ b/tools/tax/CMakeLists.txt @@ -16,6 +16,10 @@ add_executable(aligns_to src/aligns_to.cpp) target_link_libraries(aligns_to PRIVATE ReaderLib ${SYS_LIBRARIES}) links_and_install_subdir(aligns_to tax) +add_executable(reads_from src/reads_from.cpp) +target_link_libraries(reads_from PRIVATE ReaderLib ${SYS_LIBRARIES}) +links_and_install_subdir(reads_from tax) + add_executable(dump_kmers src/dump_kmers.cpp) target_link_libraries(dump_kmers PRIVATE ReaderLib ${SYS_LIBRARIES}) links_and_install_subdir(dump_kmers tax) diff --git a/tools/tax/src/config_reads_from.hpp b/tools/tax/src/config_reads_from.hpp new file mode 100644 index 00000000..d61a7517 --- /dev/null +++ b/tools/tax/src/config_reads_from.hpp @@ -0,0 +1,140 @@ +/*=========================================================================== +* +* PUBLIC DOMAIN NOTICE +* National Center for Biotechnology Information +* +* This software/database is a "United States Government Work" under the +* terms of the United States Copyright Act. It was written as part of +* the author's official duties as a United States Government employee and +* thus cannot be copyrighted. This software/database is freely available +* to the public for use. The National Library of Medicine and the U.S. +* Government have not placed any restriction on its use or reproduction. +* +* Although all reasonable efforts have been taken to ensure the accuracy +* and reliability of the software and data, the NLM and the U.S. +* Government do not and cannot warrant the performance or results that +* may be obtained by using this software or data. The NLM and the U.S. +* Government disclaim all warranties, express or implied, including +* warranties of performance, merchantability or fitness for any particular +* purpose. +* +* Please cite the author in any work or product based on this material. +* +* =========================================================================== +* +*/ + +#pragma once + +#include +#include +#include +#include +#include +#include "log.h" +#include "missing_cpp_features.h" + +struct Args +{ + using FileList = std::list ; + FileList files; + std::string spot_filter_file; + + bool unaligned_only = false; + bool hide_counts = false, compact = false; + + int optimization_ultrafast_skip_reader = 0; + + Args(int argc, char const *argv[]) + { + int arg = 1; + auto have_arg = [&]() -> bool { return arg < argc; }; + auto pop_arg = [&]() -> std::string { + if (have_arg()) { + auto const result = argv[arg]; + ++arg; + return std::string(result); + } + fail("need more args"); + }; + std::string file; + auto have_file = false; + + while (have_arg()) { + auto const &arg = pop_arg(); + + if (arg.empty()) { +USAGE_ERROR: + std::string reason = "unexpected argument: \"" + arg + "\""; + fail(reason.c_str()); + } + if (arg.front() == '-') { + if (arg == "-unaligned_only") + unaligned_only = true; + else if (arg == "-spot_filter") + spot_filter_file = pop_arg(); + else if (arg == "-optimization_ultrafast_skip_reader") + optimization_ultrafast_skip_reader = std::stoi(pop_arg()); + else + goto USAGE_ERROR; + } + else if (have_file) + goto USAGE_ERROR; + else { + file = arg; + have_file = true; + } + } + + // exactly one should exist + if (!have_file) + fail("nothing to process"); + + if (ends_with(file, ".list")) + files = load_list(file); + else + files.push_back(file); + + if (files.empty()) + fail("loaded empty list of files to process"); + } + + static std::list load_list(const std::string &filename) + { + std::ifstream f(filename); + if (f.fail()) + throw std::runtime_error(std::string("cannot open list file ") + filename); + + std::list items; + + while (!f.eof()) + { + std::string s; + f >> s; + if (f.fail()) + break; + + items.push_back(s); + } + + return items; + } + + static void fail [[noreturn]] (const char* reason = "invalid arguments") + { + LOG(reason); + print_usage(); + exit(1); + } + + static void print_usage() + { + std::cerr << + "need fasta/accession or file containing list of fasta/accession\n" + "examples:\n" + " reads_from inputs.list # NB. extension is .list\n" + " reads_from chr1.fasta # NB. extension is .fasta, .fa, or .fna\n" + " reads_from SRR000001 # not one of the above\n" + << std::endl; + } +}; diff --git a/tools/tax/src/reads_from.cpp b/tools/tax/src/reads_from.cpp new file mode 100644 index 00000000..818c76ea --- /dev/null +++ b/tools/tax/src/reads_from.cpp @@ -0,0 +1,96 @@ +/*=========================================================================== + * + * PUBLIC DOMAIN NOTICE + * National Center for Biotechnology Information + * + * This software/database is a "United States Government Work" under the + * terms of the United States Copyright Act. It was written as part of + * the author's official duties as a United States Government employee and + * thus cannot be copyrighted. This software/database is freely available + * to the public for use. The National Library of Medicine and the U.S. + * Government have not placed any restriction on its use or reproduction. + * + * Although all reasonable efforts have been taken to ensure the accuracy + * and reliability of the software and data, the NLM and the U.S. + * Government do not and cannot warrant the performance or results that + * may be obtained by using this software or data. The NLM and the U.S. + * Government disclaim all warranties, express or implied, including + * warranties of performance, merchantability or fitness for any particular + * purpose. + * + * Please cite the author in any work or product based on this material. + * + * =========================================================================== + * + */ + +#include "config_reads_from.hpp" + +#include +#include +#include +#include + +const std::string VERSION = "0.1"; + +typedef uint64_t hash_t; + +#include "reader.h" +#include "missing_cpp_features.h" + +using namespace std; +using namespace std::chrono; + +static int process(Args::FileList const &files, Reader::Params const ¶ms) +{ + auto const before = high_resolution_clock::now(); + auto const multifile = files.size() > 1; + + for (auto &file : files) + { + LOG(file); + + try { + auto reader = Reader::create(file, params); + Reader::Fragment frag; + + while (reader->read(&frag)) { + cout << '>' << frag.spotid << '\n' << + frag.bases << '\n'; + } + } + catch (std::exception const &e) + { + LOG(e.what()); + if (!multifile) + throw e; + } + } + + LOG("total time (sec) " << std::chrono::duration_cast( high_resolution_clock::now() - before ).count()); + + return 0; +} + +static int process(Args const &args) +{ + Reader::Params params; + + params.filter_file = args.spot_filter_file; + params.ultrafast_skip_reader = args.optimization_ultrafast_skip_reader; + params.unaligned_only = args.unaligned_only; + + return process(args.files, params); +} + +int main(int argc, char const *argv[]) +{ +#ifdef __GLIBCXX__ + std::set_terminate(__gnu_cxx::__verbose_terminate_handler); +#endif + + LOG("reads_from version " << VERSION); + + return process(Args(argc, argv)); +} +