Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add new reads_from tool #15

Open
wants to merge 5 commits into
base: tax
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions tools/tax/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,10 @@ add_executable(aligns_to src/aligns_to.cpp)
target_link_libraries(aligns_to PRIVATE ReaderLib ${SYS_LIBRARIES})
links_and_install_subdir(aligns_to tax)

add_executable(reads_from src/reads_from.cpp)
target_link_libraries(reads_from PRIVATE ReaderLib ${SYS_LIBRARIES})
links_and_install_subdir(reads_from tax)

add_executable(dump_kmers src/dump_kmers.cpp)
target_link_libraries(dump_kmers PRIVATE ReaderLib ${SYS_LIBRARIES})
links_and_install_subdir(dump_kmers tax)
Expand Down
140 changes: 140 additions & 0 deletions tools/tax/src/config_reads_from.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,140 @@
/*===========================================================================
*
* PUBLIC DOMAIN NOTICE
* National Center for Biotechnology Information
*
* This software/database is a "United States Government Work" under the
* terms of the United States Copyright Act. It was written as part of
* the author's official duties as a United States Government employee and
* thus cannot be copyrighted. This software/database is freely available
* to the public for use. The National Library of Medicine and the U.S.
* Government have not placed any restriction on its use or reproduction.
*
* Although all reasonable efforts have been taken to ensure the accuracy
* and reliability of the software and data, the NLM and the U.S.
* Government do not and cannot warrant the performance or results that
* may be obtained by using this software or data. The NLM and the U.S.
* Government disclaim all warranties, express or implied, including
* warranties of performance, merchantability or fitness for any particular
* purpose.
*
* Please cite the author in any work or product based on this material.
*
* ===========================================================================
*
*/

#pragma once

#include <string>
#include <iostream>
#include <fstream>
#include <list>
#include <stdexcept>
#include "log.h"
#include "missing_cpp_features.h"

struct Args
{
using FileList = std::list <std::string>;
FileList files;
std::string spot_filter_file;

bool unaligned_only = false;
bool hide_counts = false, compact = false;

int optimization_ultrafast_skip_reader = 0;

Args(int argc, char const *argv[])
{
int arg = 1;
auto have_arg = [&]() -> bool { return arg < argc; };
auto pop_arg = [&]() -> std::string {
if (have_arg()) {
auto const result = argv[arg];
++arg;
return std::string(result);
}
fail("need more args");
};
std::string file;
auto have_file = false;

while (have_arg()) {
auto const &arg = pop_arg();

if (arg.empty()) {
USAGE_ERROR:
std::string reason = "unexpected argument: \"" + arg + "\"";
fail(reason.c_str());
}
if (arg.front() == '-') {
if (arg == "-unaligned_only")
unaligned_only = true;
else if (arg == "-spot_filter")
spot_filter_file = pop_arg();
else if (arg == "-optimization_ultrafast_skip_reader")
optimization_ultrafast_skip_reader = std::stoi(pop_arg());
else
goto USAGE_ERROR;
}
else if (have_file)
goto USAGE_ERROR;
else {
file = arg;
have_file = true;
}
}

// exactly one should exist
if (!have_file)
fail("nothing to process");

if (ends_with(file, ".list"))
files = load_list(file);
else
files.push_back(file);

if (files.empty())
fail("loaded empty list of files to process");
}

static std::list<std::string> load_list(const std::string &filename)
{
std::ifstream f(filename);
if (f.fail())
throw std::runtime_error(std::string("cannot open list file ") + filename);

std::list<std::string> items;

while (!f.eof())
{
std::string s;
f >> s;
if (f.fail())
break;

items.push_back(s);
}

return items;
}

static void fail [[noreturn]] (const char* reason = "invalid arguments")
{
LOG(reason);
print_usage();
exit(1);
}

static void print_usage()
{
std::cerr <<
"need fasta/accession or file containing list of fasta/accession\n"
"examples:\n"
" reads_from inputs.list # NB. extension is .list\n"
" reads_from chr1.fasta # NB. extension is .fasta, .fa, or .fna\n"
" reads_from SRR000001 # not one of the above\n"
<< std::endl;
}
};
96 changes: 96 additions & 0 deletions tools/tax/src/reads_from.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
/*===========================================================================
*
* PUBLIC DOMAIN NOTICE
* National Center for Biotechnology Information
*
* This software/database is a "United States Government Work" under the
* terms of the United States Copyright Act. It was written as part of
* the author's official duties as a United States Government employee and
* thus cannot be copyrighted. This software/database is freely available
* to the public for use. The National Library of Medicine and the U.S.
* Government have not placed any restriction on its use or reproduction.
*
* Although all reasonable efforts have been taken to ensure the accuracy
* and reliability of the software and data, the NLM and the U.S.
* Government do not and cannot warrant the performance or results that
* may be obtained by using this software or data. The NLM and the U.S.
* Government disclaim all warranties, express or implied, including
* warranties of performance, merchantability or fitness for any particular
* purpose.
*
* Please cite the author in any work or product based on this material.
*
* ===========================================================================
*
*/

#include "config_reads_from.hpp"

#include <iostream>
#include <chrono>
#include <thread>
#include <list>

const std::string VERSION = "0.1";

typedef uint64_t hash_t;

#include "reader.h"
#include "missing_cpp_features.h"

using namespace std;
using namespace std::chrono;

static int process(Args::FileList const &files, Reader::Params const &params)
{
auto const before = high_resolution_clock::now();
auto const multifile = files.size() > 1;

for (auto &file : files)
{
LOG(file);

try {
auto reader = Reader::create(file, params);
Reader::Fragment frag;

while (reader->read(&frag)) {
cout << '>' << frag.spotid << '\n' <<
frag.bases << '\n';
}
}
catch (std::exception const &e)
{
LOG(e.what());
if (!multifile)
throw e;
}
}

LOG("total time (sec) " << std::chrono::duration_cast<std::chrono::seconds>( high_resolution_clock::now() - before ).count());

return 0;
}

static int process(Args const &args)
{
Reader::Params params;

params.filter_file = args.spot_filter_file;
params.ultrafast_skip_reader = args.optimization_ultrafast_skip_reader;
params.unaligned_only = args.unaligned_only;

return process(args.files, params);
}

int main(int argc, char const *argv[])
{
#ifdef __GLIBCXX__
std::set_terminate(__gnu_cxx::__verbose_terminate_handler);
#endif

LOG("reads_from version " << VERSION);

return process(Args(argc, argv));
}