diff -Nru abyss-2.2.3/ABYSS/abyss.cc abyss-2.2.4/ABYSS/abyss.cc --- abyss-2.2.3/ABYSS/abyss.cc 2019-09-27 20:02:10.000000000 +0000 +++ abyss-2.2.4/ABYSS/abyss.cc 2020-01-30 18:48:44.000000000 +0000 @@ -1,35 +1,40 @@ +#include "Assembly/Options.h" #if PAIRED_DBG -# include "PairedDBG/SequenceCollection.h" -# include "PairedDBG/PairedDBGAlgorithms.h" +#include "PairedDBG/PairedDBGAlgorithms.h" +#include "PairedDBG/SequenceCollection.h" #else -# include "Assembly/SequenceCollection.h" +#include "Assembly/SequenceCollection.h" #endif #include "Assembly/AssemblyAlgorithms.h" #include "Assembly/DotWriter.h" +#include "DataBase/DB.h" + #include #include // for setvbuf #include #include #include -#include "DataBase/DB.h" using namespace std; DB db; -static void removeLowCoverageContigs(SequenceCollectionHash& g) +static void +removeLowCoverageContigs(SequenceCollectionHash& g) { AssemblyAlgorithms::markAmbiguous(&g); cout << "Removing low-coverage contigs " - "(mean k-mer coverage < " << opt::coverage << ")\n"; + "(mean k-mer coverage < " + << opt::coverage << ")\n"; AssemblyAlgorithms::assemble(&g); AssemblyAlgorithms::splitAmbiguous(&g); opt::coverage = 0; } -static void popBubbles(SequenceCollectionHash& g) +static void +popBubbles(SequenceCollectionHash& g) { cout << "Popping bubbles" << endl; ofstream out; @@ -39,8 +44,8 @@ cout << "Removed " << numPopped << " bubbles\n"; } -static void write_graph(const string& path, - const SequenceCollectionHash& c) +static void +write_graph(const string& path, const SequenceCollectionHash& c) { if (path.empty()) return; @@ -49,16 +54,17 @@ DotWriter::write(out, c); } -static void assemble(const string& pathIn, const string& pathOut) +static void +assemble(const string& pathIn, const string& pathOut) { Timer timer(__func__); SequenceCollectionHash g; if (!pathIn.empty()) AssemblyAlgorithms::loadSequences(&g, pathIn.c_str()); - for_each(opt::inFiles.begin(), opt::inFiles.end(), bind1st( - ptr_fun(AssemblyAlgorithms::loadSequences), - &g)); + for_each(opt::inFiles.begin(), opt::inFiles.end(), [&g](std::string s) { + AssemblyAlgorithms::loadSequences(&g, s); + }); size_t numLoaded = g.size(); if (!opt::db.empty()) addToDb(db, "loadedKmer", numLoaded); @@ -70,16 +76,14 @@ exit(EXIT_FAILURE); } - AssemblyAlgorithms::setCoverageParameters( - AssemblyAlgorithms::coverageHistogram(g)); + AssemblyAlgorithms::setCoverageParameters(AssemblyAlgorithms::coverageHistogram(g)); if (opt::kc > 0) { cout << "Minimum k-mer multiplicity kc is " << opt::kc << endl; cout << "Removing low-multiplicity k-mers" << endl; size_t removed = AssemblyAlgorithms::applyKmerCoverageThreshold(g, opt::kc); - cout << "Removed " << removed - << " low-multiplicity k-mers, " << g.size() - << " k-mers remaining" << std::endl; + cout << "Removed " << removed << " low-multiplicity k-mers, " << g.size() + << " k-mers remaining" << std::endl; } cout << "Generating adjacency" << endl; @@ -122,13 +126,14 @@ size_t numAssembled = g.size(); size_t numRemoved = numLoaded - numAssembled; - cout << "Removed " << numRemoved << " k-mer.\n" - "The signal-to-noise ratio (SNR) is " - << 10 * log10((double)numAssembled / numRemoved) - << " dB.\n"; + cout << "Removed " << numRemoved + << " k-mer.\n" + "The signal-to-noise ratio (SNR) is " + << 10 * log10((double)numAssembled / numRemoved) << " dB.\n"; } -int main(int argc, char* const* argv) +int +main(int argc, char* const* argv) { Timer timer("Total"); @@ -142,16 +147,16 @@ bool krange = opt::kMin != opt::kMax; if (krange) - cout << "Assembling k=" << opt::kMin << "-" << opt::kMax - << ":" << opt::kStep << endl; + cout << "Assembling k=" << opt::kMin << "-" << opt::kMax << ":" << opt::kStep << endl; if (!opt::db.empty()) { - init(db, - opt::getUvalue(), - opt::getVvalue(), - "ABYSS", - opt::getCommand(), - opt::getMetaValue()); + init( + db, + opt::getUvalue(), + opt::getVvalue(), + "ABYSS", + opt::getCommand(), + opt::getMetaValue()); addToDb(db, "SS", opt::ss); addToDb(db, "k", opt::kmerSize); addToDb(db, "singleK", opt::singleKmerSize); @@ -175,7 +180,7 @@ opt::erodeStrand = (unsigned)-1; opt::coverage = -1; opt::trimLen = k; - opt::bubbleLen = 3*k; + opt::bubbleLen = 3 * k; } ostringstream k0, k1; diff -Nru abyss-2.2.3/Assembly/BranchGroup.h abyss-2.2.4/Assembly/BranchGroup.h --- abyss-2.2.3/Assembly/BranchGroup.h 2019-09-27 20:02:10.000000000 +0000 +++ abyss-2.2.4/Assembly/BranchGroup.h 2020-01-30 18:48:44.000000000 +0000 @@ -19,195 +19,194 @@ /** A container of BranchRecord. */ class BranchGroup { - public: - typedef std::vector BranchGroupData; - typedef BranchGroupData::iterator iterator; - typedef BranchGroupData::const_iterator const_iterator; - - BranchGroup() - : m_dir(SENSE), m_maxNumBranches(0), - m_noExt(false), m_status(BGS_ACTIVE) - { } - - BranchGroup(extDirection dir, size_t maxNumBranches, - const BranchRecord::V &origin) - : m_dir(dir), m_origin(origin), - m_maxNumBranches(maxNumBranches), m_noExt(false), - m_status(BGS_ACTIVE) - { - m_branches.reserve(m_maxNumBranches); - } + public: + typedef std::vector BranchGroupData; + typedef BranchGroupData::iterator iterator; + typedef BranchGroupData::const_iterator const_iterator; + + BranchGroup() + : m_dir(SENSE) + , m_maxNumBranches(0) + , m_noExt(false) + , m_status(BGS_ACTIVE) + {} + + BranchGroup(extDirection dir, size_t maxNumBranches, const BranchRecord::V& origin) + : m_dir(dir) + , m_origin(origin) + , m_maxNumBranches(maxNumBranches) + , m_noExt(false) + , m_status(BGS_ACTIVE) + { + m_branches.reserve(m_maxNumBranches); + } - BranchGroup(extDirection dir, size_t maxNumBranches, - const BranchRecord::V &origin, const BranchRecord& branch) - : m_dir(dir), m_origin(origin), - m_maxNumBranches(maxNumBranches), m_noExt(false), - m_status(BGS_ACTIVE) - { - m_branches.reserve(m_maxNumBranches); - m_branches.push_back(branch); - } + BranchGroup( + extDirection dir, + size_t maxNumBranches, + const BranchRecord::V& origin, + const BranchRecord& branch) + : m_dir(dir) + , m_origin(origin) + , m_maxNumBranches(maxNumBranches) + , m_noExt(false) + , m_status(BGS_ACTIVE) + { + m_branches.reserve(m_maxNumBranches); + m_branches.push_back(branch); + } - BranchGroup(const BranchGroup& o) - : m_branches(o.m_branches), m_dir(o.m_dir), - m_origin(o.m_origin), - m_maxNumBranches(o.m_maxNumBranches), - m_noExt(o.m_noExt), m_status(o.m_status) - { - m_branches.reserve(m_maxNumBranches); - } + BranchGroup(const BranchGroup& o) + : m_branches(o.m_branches) + , m_dir(o.m_dir) + , m_origin(o.m_origin) + , m_maxNumBranches(o.m_maxNumBranches) + , m_noExt(o.m_noExt) + , m_status(o.m_status) + { + m_branches.reserve(m_maxNumBranches); + } - /** Add a branch to this group. */ - BranchRecord& addBranch(const BranchRecord& branch) - { - assert(m_branches.size() < m_maxNumBranches); - m_branches.push_back(branch); - return m_branches.back(); - } + /** Add a branch to this group. */ + BranchRecord& addBranch(const BranchRecord& branch) + { + assert(m_branches.size() < m_maxNumBranches); + m_branches.push_back(branch); + return m_branches.back(); + } - /** Add a branch to this group and extend the new branch with - * the given k-mer. */ - void addBranch(const BranchRecord& branch, - const BranchRecord::V& kmer) - { - if (m_branches.size() < m_maxNumBranches) - addBranch(branch).push_back( - std::make_pair(kmer, BranchRecord::VP())); - else - m_status = BGS_TOOMANYBRANCHES; - } + /** Add a branch to this group and extend the new branch with + * the given k-mer. */ + void addBranch(const BranchRecord& branch, const BranchRecord::V& kmer) + { + if (m_branches.size() < m_maxNumBranches) + addBranch(branch).push_back(std::make_pair(kmer, BranchRecord::VP())); + else + m_status = BGS_TOOMANYBRANCHES; + } - /** Return the specified branch. */ - BranchRecord& operator [](unsigned id) - { - return m_branches[id]; - } + /** Return the specified branch. */ + BranchRecord& operator[](unsigned id) { return m_branches[id]; } - /** Return the number of branches in this group. */ - size_t size() const { return m_branches.size(); } + /** Return the number of branches in this group. */ + size_t size() const { return m_branches.size(); } - /** Return whether a branch contains the specified k-mer at - * the index i. */ - bool exists(unsigned i, const BranchRecord::V& kmer) const - { - for (BranchGroupData::const_iterator it - = m_branches.begin(); - it != m_branches.end(); ++it) - if (it->exists(i, kmer)) - return true; - return false; - } + /** Return whether a branch contains the specified k-mer at + * the index i. */ + bool exists(unsigned i, const BranchRecord::V& kmer) const + { + for (BranchGroupData::const_iterator it = m_branches.begin(); it != m_branches.end(); ++it) + if (it->exists(i, kmer)) + return true; + return false; + } - // return the current status of the branch - BranchGroupStatus getStatus() const { return m_status; } + // return the current status of the branch + BranchGroupStatus getStatus() const { return m_status; } - // set the no extension flag - void setNoExtension() { m_noExt = true; } + // set the no extension flag + void setNoExtension() { m_noExt = true; } - // is the no extension flag set? - bool isNoExt() const { return m_noExt; } + // is the no extension flag set? + bool isNoExt() const { return m_noExt; } - // return the direction of growth - extDirection getDirection() const { return m_dir; } - - iterator begin() { return m_branches.begin(); } - iterator end() { return m_branches.end(); } - const_iterator begin() const { return m_branches.begin(); } - const_iterator end() const { return m_branches.end(); } - -// Check the stop conditions for the bubble growth -BranchGroupStatus -updateStatus(unsigned maxLength) -{ - assert(m_branches.size() <= m_maxNumBranches); + // return the direction of growth + extDirection getDirection() const { return m_dir; } - if (m_status != BGS_ACTIVE) - return m_status; + iterator begin() { return m_branches.begin(); } + iterator end() { return m_branches.end(); } + const_iterator begin() const { return m_branches.begin(); } + const_iterator end() const { return m_branches.end(); } - // Check if the no extension flag is set - if(m_noExt) + // Check the stop conditions for the bubble growth + BranchGroupStatus updateStatus(unsigned maxLength) { - m_status = BGS_NOEXT; - return m_status; - } + assert(m_branches.size() <= m_maxNumBranches); + + if (m_status != BGS_ACTIVE) + return m_status; - // Check if any branches are too long or any sequence has a loop - for (BranchGroupData::const_iterator iter = m_branches.begin(); - iter != m_branches.end(); ++iter) { - if (iter->isTooLong(maxLength)) { - m_status = BGS_TOOLONG; + // Check if the no extension flag is set + if (m_noExt) { + m_status = BGS_NOEXT; return m_status; } + + // Check if any branches are too long or any sequence has a loop + for (BranchGroupData::const_iterator iter = m_branches.begin(); iter != m_branches.end(); + ++iter) { + if (iter->isTooLong(maxLength)) { + m_status = BGS_TOOLONG; + return m_status; + } + } + + BranchGroupData::const_iterator it = m_branches.begin(); + const BranchRecord::V& lastSeq = it->back().first; + while (++it != m_branches.end()) + if (it->back().first != lastSeq) + return m_status = BGS_ACTIVE; + + // All the branches of the bubble have joined. + // Remove the last base, which is identical for every branch. + std::for_each( + m_branches.begin(), m_branches.end(), [](BranchRecord& b) { return b.pop_back(); }); + + // Sort the branches by coverage. + std::function lambda = [](const BranchRecord& b) { + return b.calculateBranchMultiplicity(); + }; + sort_by_transform(m_branches.begin(), m_branches.end(), lambda); + reverse(m_branches.begin(), m_branches.end()); + + return m_status = BGS_JOINED; } - BranchGroupData::const_iterator it = m_branches.begin(); - const BranchRecord::V& lastSeq = it->back().first; - while (++it != m_branches.end()) - if (it->back().first != lastSeq) - return m_status = BGS_ACTIVE; - - // All the branches of the bubble have joined. - // Remove the last base, which is identical for every branch. - std::for_each(m_branches.begin(), m_branches.end(), - std::mem_fun_ref(&BranchRecord::pop_back)); - - // Sort the branches by coverage. - sort_by_transform(m_branches.begin(), m_branches.end(), - std::mem_fun_ref(&BranchRecord::calculateBranchMultiplicity)); - reverse(m_branches.begin(), m_branches.end()); - - return m_status = BGS_JOINED; -} - -/** Return whether any branches of this group are active. */ -bool -isActive() const -{ - for (BranchGroupData::const_iterator it = m_branches.begin(); - it != m_branches.end(); ++it) - if (it->isActive()) - return true; - return false; -} - -/** Return whether this branch is extendable. */ -bool -isExtendable() -{ - if (m_noExt) + /** Return whether any branches of this group are active. */ + bool isActive() const + { + for (BranchGroupData::const_iterator it = m_branches.begin(); it != m_branches.end(); ++it) + if (it->isActive()) + return true; return false; + } - // A group is extendable when all the branches are the same - // length. All the branches are lockstepped for growth. - BranchGroupData::iterator it = m_branches.begin(); - unsigned length = it++->size(); - for (; it != m_branches.end(); ++it) - if (it->size() != length) + /** Return whether this branch is extendable. */ + bool isExtendable() + { + if (m_noExt) return false; - return true; -} -/** Return whether this branch is ambiguous at its origin. Also - * returns false if the origin of the branch has since been deleted. - */ -bool -isAmbiguous(const SequenceCollectionHash& g) const -{ - // Get fresh data from the collection to check that this bubble - // does in fact still exist. - const BranchRecord::VP& data = g.getSeqAndData(m_origin).second; - return data.deleted() ? false : data.isAmbiguous(m_dir); -} - - private: - BranchGroup& operator =(const BranchGroup& o); - - BranchGroupData m_branches; - extDirection m_dir; - BranchRecord::V m_origin; - size_t m_maxNumBranches; - bool m_noExt; - BranchGroupStatus m_status; + // A group is extendable when all the branches are the same + // length. All the branches are lockstepped for growth. + BranchGroupData::iterator it = m_branches.begin(); + unsigned length = it++->size(); + for (; it != m_branches.end(); ++it) + if (it->size() != length) + return false; + return true; + } + + /** Return whether this branch is ambiguous at its origin. Also + * returns false if the origin of the branch has since been deleted. + */ + bool isAmbiguous(const SequenceCollectionHash& g) const + { + // Get fresh data from the collection to check that this bubble + // does in fact still exist. + const BranchRecord::VP& data = g.getSeqAndData(m_origin).second; + return data.deleted() ? false : data.isAmbiguous(m_dir); + } + + private: + BranchGroup& operator=(const BranchGroup& o); + + BranchGroupData m_branches; + extDirection m_dir; + BranchRecord::V m_origin; + size_t m_maxNumBranches; + bool m_noExt; + BranchGroupStatus m_status; }; #endif diff -Nru abyss-2.2.3/bin/abyss-pe abyss-2.2.4/bin/abyss-pe --- abyss-2.2.3/bin/abyss-pe 2019-09-27 20:02:10.000000000 +0000 +++ abyss-2.2.4/bin/abyss-pe 2020-01-30 18:48:44.000000000 +0000 @@ -398,7 +398,7 @@ @echo 'Report bugs to https://github.com/bcgsc/abyss/issues or abyss-users@bcgsc.ca.' version: - @echo "abyss-pe (ABySS) 2.2.3" + @echo "abyss-pe (ABySS) 2.2.4" @echo "Written by Shaun Jackman and Anthony Raymond." @echo @echo "Copyright 2012 Canada's Michael Smith Genome Science Centre" diff -Nru abyss-2.2.3/Bloom/bloom.cc abyss-2.2.4/Bloom/bloom.cc --- abyss-2.2.3/Bloom/bloom.cc 2019-09-27 20:02:10.000000000 +0000 +++ abyss-2.2.4/Bloom/bloom.cc 2020-01-30 18:48:44.000000000 +0000 @@ -99,7 +99,7 @@ " -n, --num-locks=N number of write locks on bloom filter [1000]\n" " -q, --trim-quality=N trim bases from the ends of reads whose\n" " quality is less than the threshold\n" - " -t, --bloom-type=STR 'konnector' or 'rolling-hash' [konnector]\n" + " -t, --bloom-type=STR 'konnector', 'rolling-hash', or 'counting' [konnector]\n" " --standard-quality zero quality is `!' (33)\n" " default for FASTQ and SAM files\n" " --illumina-quality zero quality is `@' (64)\n" diff -Nru abyss-2.2.3/BloomDBG/bloom-dbg.h abyss-2.2.4/BloomDBG/bloom-dbg.h --- abyss-2.2.3/BloomDBG/bloom-dbg.h 2019-09-27 20:02:10.000000000 +0000 +++ abyss-2.2.4/BloomDBG/bloom-dbg.h 2020-01-30 18:48:44.000000000 +0000 @@ -91,6 +91,24 @@ } /** + * Returns the sum of all kmer multiplicities in `seq` by querying `bloom` + */ +template +inline static unsigned +getSeqAbsoluteKmerCoverage(const Sequence& seq, const CountingBloomT& bloom) +{ + const unsigned k = bloom.getKmerSize(); + const unsigned numHashes = bloom.getHashNum(); + assert(seq.length() >= k); + unsigned coverage = 0; + for (RollingHashIterator it(seq, numHashes, k); it != RollingHashIterator::end(); + ++it) { + coverage += bloom.minCount(*it); + } + return coverage; +} + +/** * Translate a DNA sequence to an equivalent path in the * de Bruijn graph. */ @@ -170,6 +188,8 @@ size_t contigID; /** length of contig (bp) */ unsigned length; + /** coverage of contig */ + unsigned coverage; /** FASTA ID of seeding read */ std::string readID; /** Type of sequence used to seed contig (branch k-mer or full read) */ @@ -433,6 +453,8 @@ inline static void printContig( const Sequence& seq, + unsigned length, + unsigned coverage, size_t contigID, const std::string& readID, unsigned k, @@ -448,6 +470,7 @@ /* add FASTA comment indicating extended read id */ std::ostringstream comment; + comment << length << ' ' << coverage << ' '; comment << "read:" << readID; assert(id.good()); contig.id = id.str(); @@ -510,11 +533,12 @@ * Output a contig sequence if it is not redundant, i.e. it has not already * been generated from a different read / thread of execution. */ -template +template inline static void outputContig( const Path& contigPath, ContigRecord& rec, + SolidKmerSetT& solidKmerSet, AssembledKmerSetT& assembledKmerSet, KmerHash& contigEndKmers, const AssemblyParams& params, @@ -572,15 +596,17 @@ if (!redundant) { #pragma omp critical(fasta) { + rec.length = seq.length(); + rec.coverage = getSeqAbsoluteKmerCoverage(seq, solidKmerSet); + /* add contig to output FASTA */ - printContig(seq, counters.contigID, rec.readID, params.k, streams.out); + printContig(seq, rec.length, rec.coverage, counters.contigID, rec.readID, params.k, streams.out); /* add contig to checkpoint FASTA file */ if (params.checkpointsEnabled()) - printContig(seq, counters.contigID, rec.readID, params.k, streams.checkpointOut); + printContig(seq, rec.length, rec.coverage, counters.contigID, rec.readID, params.k, streams.checkpointOut); rec.contigID = counters.contigID; - rec.length = seq.length(); counters.contigID++; counters.basesAssembled += seq.length(); @@ -842,7 +868,7 @@ /* output contig to FASTA file */ outputContig( - contigPath, contigRec, assembledKmerSet, contigEndKmers, params, counters, streams); + contigPath, contigRec, solidKmerSet, assembledKmerSet, contigEndKmers, params, counters, streams); } /* mark contig k-mers as visited */ diff -Nru abyss-2.2.3/ChangeLog abyss-2.2.4/ChangeLog --- abyss-2.2.3/ChangeLog 2019-09-27 20:02:10.000000000 +0000 +++ abyss-2.2.4/ChangeLog 2020-01-30 18:48:44.000000000 +0000 @@ -1,3 +1,20 @@ +2020-01-30 Johnathan Wong + + * Release version 2.2.4 + + General: + * Refactor deprecated functions in clang-8 + + Sealer: + * Remove unsupported -D option from help page + + abyss-bloom: + * Add counting Bloom Filter instruction to help page + + abyss-bloom-dbg: + * Report coverage information of unitigs + + 2019-09-20 Johnathan Wong * Release version 2.2.3 diff -Nru abyss-2.2.3/configure.ac abyss-2.2.4/configure.ac --- abyss-2.2.3/configure.ac 2019-09-27 20:02:10.000000000 +0000 +++ abyss-2.2.4/configure.ac 2020-01-30 18:48:44.000000000 +0000 @@ -1,5 +1,5 @@ AC_PREREQ(2.62) -AC_INIT(ABySS, 2.2.3, abyss-users@bcgsc.ca, abyss, +AC_INIT(ABySS, 2.2.4, abyss-users@bcgsc.ca, abyss, http://www.bcgsc.ca/platform/bioinfo/software/abyss) AC_CONFIG_MACRO_DIR([m4]) diff -Nru abyss-2.2.3/debian/changelog abyss-2.2.4/debian/changelog --- abyss-2.2.3/debian/changelog 2019-12-14 23:00:48.000000000 +0000 +++ abyss-2.2.4/debian/changelog 2020-02-11 14:39:45.000000000 +0000 @@ -1,3 +1,11 @@ +abyss (2.2.4-1) unstable; urgency=medium + + * New upstream version + * spelling patch incorporated upstream, removed it + * Remove markdown from the nodoc build profile, to allow for cross-building + + -- Michael R. Crusoe Tue, 11 Feb 2020 15:39:45 +0100 + abyss (2.2.3-2) unstable; urgency=medium * Use libtext-multimarkdown-perl to obtain html documentation diff -Nru abyss-2.2.3/debian/control abyss-2.2.4/debian/control --- abyss-2.2.3/debian/control 2019-12-14 23:00:48.000000000 +0000 +++ abyss-2.2.4/debian/control 2020-02-11 14:39:33.000000000 +0000 @@ -8,14 +8,14 @@ libboost-graph-dev, libopenmpi-dev, libsparsehash-dev, - libtext-multimarkdown-perl, + libtext-multimarkdown-perl , libsqlite3-dev, pkg-config, help2man Standards-Version: 4.4.1 Vcs-Browser: https://salsa.debian.org/med-team/abyss Vcs-Git: https://salsa.debian.org/med-team/abyss.git -Homepage: http://www.bcgsc.ca/platform/bioinfo/software/abyss +Homepage: https://github.com/bcgsc/abyss Package: abyss Architecture: any-amd64 arm64 armel armhf any-i386 mips64el mipsel ppc64el s390x alpha ia64 m68k powerpcspe ppc64 riscv64 sh4 sparc64 x32 diff -Nru abyss-2.2.3/debian/patches/series abyss-2.2.4/debian/patches/series --- abyss-2.2.3/debian/patches/series 2019-12-14 23:00:48.000000000 +0000 +++ abyss-2.2.4/debian/patches/series 2020-01-30 19:53:06.000000000 +0000 @@ -1,4 +1,3 @@ abyss-pe.patch # gtest.patch #abyss-32-bit.patch -spelling diff -Nru abyss-2.2.3/debian/patches/spelling abyss-2.2.4/debian/patches/spelling --- abyss-2.2.3/debian/patches/spelling 2019-12-14 23:00:48.000000000 +0000 +++ abyss-2.2.4/debian/patches/spelling 1970-01-01 00:00:00.000000000 +0000 @@ -1,42 +0,0 @@ -From: Michael R. Crusoe -Subject: typo fixes, courtesy lintian -Forwarded: https://github.com/bcgsc/abyss/pull/301 ---- abyss.orig/Sealer/README.md -+++ abyss/Sealer/README.md -@@ -24,7 +24,7 @@ - Description - =========== - --Sealer is an application of Konnector that closes intra-scaffold gaps. It performs three sequential functions. First, regions with Ns are identified from an input scaffold. Flanking nucleotues (2 x 100bp) are extracted from those regions while respecting the strand (5' to 3') direction on the sequence immediately downstream of each gap. In the second step, flanking sequence pairs are used as input to Konnector along with a set of reads with a high level of coverage redundancy. Ideally, the reads should represent the original dataset from which the draft assembly is generated, or further whole genome shotgun (WGS) sequencing data generated from the same sample. Within Konnector, the input WGS reads are used to populate a Bloom filter, tiling the reads with a sliding window of length *k*, thus generating a probabilistic representation of all the *k*-mers in the reads. Konnector also uses crude error removal and correctional algorithms, eliminating singletons (*k*-mers that are observed only once) and fixing base mismatches in the flanking sequence pairs. Sealer launches Konnector processes using a user-input range of *k*-mer lengths. In the third and final operation, succesfully merged sequences are inserted into the gaps of the original scaffolds, and Sealer outputs a new gap-filled scaffold file. -+Sealer is an application of Konnector that closes intra-scaffold gaps. It performs three sequential functions. First, regions with Ns are identified from an input scaffold. Flanking nucleotues (2 x 100bp) are extracted from those regions while respecting the strand (5' to 3') direction on the sequence immediately downstream of each gap. In the second step, flanking sequence pairs are used as input to Konnector along with a set of reads with a high level of coverage redundancy. Ideally, the reads should represent the original dataset from which the draft assembly is generated, or further whole genome shotgun (WGS) sequencing data generated from the same sample. Within Konnector, the input WGS reads are used to populate a Bloom filter, tiling the reads with a sliding window of length *k*, thus generating a probabilistic representation of all the *k*-mers in the reads. Konnector also uses crude error removal and correctional algorithms, eliminating singletons (*k*-mers that are observed only once) and fixing base mismatches in the flanking sequence pairs. Sealer launches Konnector processes using a user-input range of *k*-mer lengths. In the third and final operation, successfully merged sequences are inserted into the gaps of the original scaffolds, and Sealer outputs a new gap-filled scaffold file. - - Installation - ============ -@@ -159,6 +159,6 @@ - * `--help`: display this help and exit - * `--version`: output version information and exit - --*k* is the size of *k*-mer for the de Bruijn graph. You may specify multiple values of *k*, which will increase the nubmer of gaps closed at the cost of increased run time. Multiple values of *k* ought to be specified in increasing order, as lower values of *k* have fewer coverage gaps and are less likely to misassemble. -+*k* is the size of *k*-mer for the de Bruijn graph. You may specify multiple values of *k*, which will increase the number of gaps closed at the cost of increased run time. Multiple values of *k* ought to be specified in increasing order, as lower values of *k* have fewer coverage gaps and are less likely to misassemble. - - *P* is the threshold for number of paths allowed to be traversed. When set to 10, Konnector will attempt to close gaps even when there are 10 different paths found. It would attempt to create a consensus sequence between these paths. The default setting is 2. ---- abyss.orig/doc/abyss-pe.1 -+++ abyss/doc/abyss-pe.1 -@@ -113,7 +113,7 @@ - SS=--SS to assemble in strand-specific mode - .br - Requires that all libraries are strand-specific RNA-Seq libraries. --Assumes that the first read in a read pair is reveresed WRT the -+Assumes that the first read in a read pair is reversed WRT the - transcripts sequenced. - .TP - .B t -@@ -242,7 +242,7 @@ - - .SS "MPI COMPATIBILITY" - Due to its use of multi-threading, DIDA has known deadlocking issues --with OpenMPI. Using the MPICH MPI library is strongly recommmended -+with OpenMPI. Using the MPICH MPI library is strongly recommended - when running assemblies with DIDA. Testing was done with MPICH 3.1.3, - compiled with --enable-threads=funneled. - diff -Nru abyss-2.2.3/debian/rules abyss-2.2.4/debian/rules --- abyss-2.2.3/debian/rules 2019-12-14 23:00:48.000000000 +0000 +++ abyss-2.2.4/debian/rules 2020-02-11 14:20:02.000000000 +0000 @@ -15,7 +15,9 @@ override_dh_auto_build: dh_auto_build +ifeq (,$(filter nodoc,$(DEB_BUILD_PROFILES) $(DEB_BUILD_OPTIONS))) markdown README.md > README.html +endif override_dh_auto_install: dh_auto_install --destdir=debian/tmp diff -Nru abyss-2.2.3/debian/upstream/edam abyss-2.2.4/debian/upstream/edam --- abyss-2.2.3/debian/upstream/edam 1970-01-01 00:00:00.000000000 +0000 +++ abyss-2.2.4/debian/upstream/edam 2020-01-30 19:52:03.000000000 +0000 @@ -0,0 +1,9 @@ +--- +ontology: EDAM (1.23) +topic: + - Sequence assembly +scopes: + - name: summary + function: + - Genome assembly + - De-novo assembly \ No newline at end of file diff -Nru abyss-2.2.3/debian/upstream/metadata abyss-2.2.4/debian/upstream/metadata --- abyss-2.2.3/debian/upstream/metadata 2019-12-14 23:00:48.000000000 +0000 +++ abyss-2.2.4/debian/upstream/metadata 2020-01-30 19:52:03.000000000 +0000 @@ -1,5 +1,20 @@ Reference: - Author: > +- Author: > + Shaun D. Jackman and Benjamin P. Vandervalk and Hamid Mohamadi + and Justin Chu and Sarah Yeo and S. Austin Hammond and Golnaz Jahesh + and Hamza Khan and Lauren Coombe and Rene L. Warren and İnanç Birol + Title: > + "ABySS 2.0: resource-efficient assembly of large genomes using a Bloom + filter" + Journal: Genome Research + Year: 2017 + Volume: 27 + Number: 5 + Pages: 768-777 + PMID: 28232478 + DOI: 10.1101/gr.214346.116 + eprint: https://genome.cshlp.org/content/27/5/768.full.pdf +- Author: > Jared T. Simpson and Kim Wong and Shaun D. Jackman and Jacqueline E. Schein and Steven J.M. Jones and İnanç Birol Title: "ABySS: A parallel assembler for short read sequence data" @@ -11,7 +26,7 @@ PMID: 19251739 DOI: 10.1101/gr.089532.108 URL: http://genome.cshlp.org/content/19/6/1117 - eprint: http://genome.cshlp.org/content/19/6/1117.full.pdf+html + eprint: http://genome.cshlp.org/content/19/6/1117.full.pdf Registry: - Name: bio.tools Entry: ABySS diff -Nru abyss-2.2.3/doc/ABYSS.1 abyss-2.2.4/doc/ABYSS.1 --- abyss-2.2.3/doc/ABYSS.1 2019-09-27 20:02:10.000000000 +0000 +++ abyss-2.2.4/doc/ABYSS.1 2020-01-30 18:48:44.000000000 +0000 @@ -1,4 +1,4 @@ -.TH ABYSS "1" "2015-May" "ABYSS (ABySS) 2.2.3" "User Commands" +.TH ABYSS "1" "2015-May" "ABYSS (ABySS) 2.2.4" "User Commands" .SH NAME ABYSS \- assemble short reads into contigs .SH SYNOPSIS diff -Nru abyss-2.2.3/doc/abyss-pe.1 abyss-2.2.4/doc/abyss-pe.1 --- abyss-2.2.3/doc/abyss-pe.1 2019-09-27 20:02:10.000000000 +0000 +++ abyss-2.2.4/doc/abyss-pe.1 2020-01-30 18:48:44.000000000 +0000 @@ -1,4 +1,4 @@ -.TH abyss-pe "1" "2015-May" "abyss-pe (ABySS) 2.2.3" "User Commands" +.TH abyss-pe "1" "2015-May" "abyss-pe (ABySS) 2.2.4" "User Commands" .SH NAME abyss-pe - assemble reads into contigs .SH SYNOPSIS @@ -113,7 +113,7 @@ SS=--SS to assemble in strand-specific mode .br Requires that all libraries are strand-specific RNA-Seq libraries. -Assumes that the first read in a read pair is reveresed WRT the +Assumes that the first read in a read pair is reversed WRT the transcripts sequenced. .TP .B t @@ -242,7 +242,7 @@ .SS "MPI COMPATIBILITY" Due to its use of multi-threading, DIDA has known deadlocking issues -with OpenMPI. Using the MPICH MPI library is strongly recommmended +with OpenMPI. Using the MPICH MPI library is strongly recommended when running assemblies with DIDA. Testing was done with MPICH 3.1.3, compiled with --enable-threads=funneled. diff -Nru abyss-2.2.3/doc/abyss-tofastq.1 abyss-2.2.4/doc/abyss-tofastq.1 --- abyss-2.2.3/doc/abyss-tofastq.1 2019-09-27 20:02:10.000000000 +0000 +++ abyss-2.2.4/doc/abyss-tofastq.1 2020-01-30 18:48:44.000000000 +0000 @@ -1,4 +1,4 @@ -.TH abyss-tofastq "1" "2015-May" "ABySS 2.2.3" "User Commands" +.TH abyss-tofastq "1" "2015-May" "ABySS 2.2.4" "User Commands" .SH NAME abyss-tofastq \- convert various file formats to FASTQ format .br diff -Nru abyss-2.2.3/FilterGraph/FilterGraph.cc abyss-2.2.4/FilterGraph/FilterGraph.cc --- abyss-2.2.3/FilterGraph/FilterGraph.cc 2019-09-27 20:02:10.000000000 +0000 +++ abyss-2.2.4/FilterGraph/FilterGraph.cc 2020-01-30 18:48:44.000000000 +0000 @@ -9,16 +9,16 @@ #include "ContigPath.h" #include "ContigProperties.h" #include "FastaReader.h" -#include "IOUtil.h" -#include "Uncompress.h" #include "Graph/ContigGraph.h" #include "Graph/ContigGraphAlgorithms.h" #include "Graph/DirectedGraph.h" #include "Graph/GraphIO.h" #include "Graph/GraphUtil.h" +#include "IOUtil.h" +#include "Uncompress.h" +#include #include #include -#include #include #include #include @@ -37,153 +37,159 @@ #define PROGRAM "abyss-filtergraph" static const char VERSION_MESSAGE[] = -PROGRAM " (" PACKAGE_NAME ") " VERSION "\n" -"Written by Tony Raymond.\n" -"\n" -"Copyright 2014 Canada's Michael Smith Genome Sciences Centre\n"; + PROGRAM " (" PACKAGE_NAME ") " VERSION "\n" + "Written by Tony Raymond.\n" + "\n" + "Copyright 2014 Canada's Michael Smith Genome Sciences Centre\n"; static const char USAGE_MESSAGE[] = -"Usage: " PROGRAM " -k [OPTION]... ADJ [FASTA]\n" -"Remove short contigs that do not contribute any relevant\n" -"information to the assembly.\n" -"\n" -" Arguments:\n" -"\n" -" ADJ contig adjacency graph\n" -" FASTA contigs to check consistency of ADJ edges\n" -"\n" -" Options:\n" -"\n" -" -k, --kmer=N k-mer size\n" -" --SS expect contigs to be oriented correctly\n" -" --no-SS no assumption about contig orientation\n" -" -T, --island=N remove islands shorter than N [0]\n" -" -t, --tip=N remove tips shorter than N [0]\n" -" -l, --length=N remove contigs shorter than N [0]\n" -" -L, --max-length=N remove contigs longer than N [0]\n" -" -c, --coverage=FLOAT remove contigs with mean k-mer coverage less than FLOAT [0]\n" -" -C, --max-coverage=FLOAT remove contigs with mean k-mer coverage at least FLOAT [0]\n" -" --shim remove filler contigs that only contribute\n" -" to adjacency [default]\n" -" --no-shim disable filler contigs removal\n" -" --shim-max-degree=N only remove shims where the smaller of \n" -" in/out degree is smaller than N [1]\n" -" -m, --min-overlap=N require a minimum overlap of N bases [10]\n" -" --assemble assemble unambiguous paths\n" -" --no-assemble disable assembling of paths [default]\n" -" -g, --graph=FILE write the contig adjacency graph to FILE\n" -" -i, --ignore=FILE ignore contigs seen in FILE\n" -" -r, --remove=FILE remove contigs seen in FILE\n" -" --adj output the graph in ADJ format [default]\n" -" --asqg output the graph in ASQG format\n" -" --dot output the graph in GraphViz format\n" -" --gfa output the graph in GFA1 format\n" -" --gfa1 output the graph in GFA1 format\n" -" --gfa2 output the graph in GFA2 format\n" -" --gv output the graph in GraphViz format\n" -" --sam output the graph in SAM format\n" -" -v, --verbose display verbose output\n" -" --help display this help and exit\n" -" --version output version information and exit\n" -"\n" -"Report bugs to <" PACKAGE_BUGREPORT ">.\n"; + "Usage: " PROGRAM " -k [OPTION]... ADJ [FASTA]\n" + "Remove short contigs that do not contribute any relevant\n" + "information to the assembly.\n" + "\n" + " Arguments:\n" + "\n" + " ADJ contig adjacency graph\n" + " FASTA contigs to check consistency of ADJ edges\n" + "\n" + " Options:\n" + "\n" + " -k, --kmer=N k-mer size\n" + " --SS expect contigs to be oriented correctly\n" + " --no-SS no assumption about contig orientation\n" + " -T, --island=N remove islands shorter than N [0]\n" + " -t, --tip=N remove tips shorter than N [0]\n" + " -l, --length=N remove contigs shorter than N [0]\n" + " -L, --max-length=N remove contigs longer than N [0]\n" + " -c, --coverage=FLOAT remove contigs with mean k-mer coverage less than FLOAT [0]\n" + " -C, --max-coverage=FLOAT remove contigs with mean k-mer coverage at least FLOAT [0]\n" + " --shim remove filler contigs that only contribute\n" + " to adjacency [default]\n" + " --no-shim disable filler contigs removal\n" + " --shim-max-degree=N only remove shims where the smaller of \n" + " in/out degree is smaller than N [1]\n" + " -m, --min-overlap=N require a minimum overlap of N bases [10]\n" + " --assemble assemble unambiguous paths\n" + " --no-assemble disable assembling of paths [default]\n" + " -g, --graph=FILE write the contig adjacency graph to FILE\n" + " -i, --ignore=FILE ignore contigs seen in FILE\n" + " -r, --remove=FILE remove contigs seen in FILE\n" + " --adj output the graph in ADJ format [default]\n" + " --asqg output the graph in ASQG format\n" + " --dot output the graph in GraphViz format\n" + " --gfa output the graph in GFA1 format\n" + " --gfa1 output the graph in GFA1 format\n" + " --gfa2 output the graph in GFA2 format\n" + " --gv output the graph in GraphViz format\n" + " --sam output the graph in SAM format\n" + " -v, --verbose display verbose output\n" + " --help display this help and exit\n" + " --version output version information and exit\n" + "\n" + "Report bugs to <" PACKAGE_BUGREPORT ">.\n"; namespace opt { - unsigned k; // used by ContigProperties +unsigned k; // used by ContigProperties - /** Run a strand-specific RNA-Seq assembly. */ - static int ss; +/** Run a strand-specific RNA-Seq assembly. */ +static int ss; - /** Remove island contigs less than this length. */ - static unsigned minIslandLen = 0; +/** Remove island contigs less than this length. */ +static unsigned minIslandLen = 0; - /** Remove tips less than this length. */ - static unsigned minTipLen = 0; +/** Remove tips less than this length. */ +static unsigned minTipLen = 0; - /** Remove all contigs less than this length. */ - static unsigned minLen = 0; +/** Remove all contigs less than this length. */ +static unsigned minLen = 0; - /** Remove all contigs more than this length. */ - static unsigned maxLen = 0; +/** Remove all contigs more than this length. */ +static unsigned maxLen = 0; - /** Remove contigs with mean k-mer coverage less than this threshold. */ - static float minCoverage = 0; +/** Remove contigs with mean k-mer coverage less than this threshold. */ +static float minCoverage = 0; - /** Remove contigs with mean k-mer coverage at least this threshold. */ - static float maxCoverage = 0; +/** Remove contigs with mean k-mer coverage at least this threshold. */ +static float maxCoverage = 0; - /** Remove short contigs that don't contribute any sequence. */ - static int shim = 1; +/** Remove short contigs that don't contribute any sequence. */ +static int shim = 1; - /** Only remove shims where the smaller of in/out degree is small - * enough. */ - static unsigned shimMaxDegree = 1; +/** Only remove shims where the smaller of in/out degree is small + * enough. */ +static unsigned shimMaxDegree = 1; - /** Assemble unambiguous paths. */ - static int assemble = 0; +/** Assemble unambiguous paths. */ +static int assemble = 0; - /** Write the contig adjacency graph to this file. */ - static string graphPath; +/** Write the contig adjacency graph to this file. */ +static string graphPath; - /** Contigs to ignore. */ - static string ignorePath; +/** Contigs to ignore. */ +static string ignorePath; - /** Contigs to remove. */ - static string removePath; +/** Contigs to remove. */ +static string removePath; - /** The minimum overlap allowed between two contigs. */ - static int minOverlap = 10; +/** The minimum overlap allowed between two contigs. */ +static int minOverlap = 10; - /** Output graph format. */ - int format = ADJ; // used by ContigProperties +/** Output graph format. */ +int format = ADJ; // used by ContigProperties } static const char shortopts[] = "c:C:g:i:r:k:l:L:m:t:T:v"; -enum { OPT_HELP = 1, OPT_VERSION, OPT_SHIM_MAX_DEG }; +enum +{ + OPT_HELP = 1, + OPT_VERSION, + OPT_SHIM_MAX_DEG +}; static const struct option longopts[] = { - { "adj", no_argument, &opt::format, ADJ }, - { "asqg", no_argument, &opt::format, ASQG }, - { "dot", no_argument, &opt::format, DOT }, - { "gfa", no_argument, &opt::format, GFA1 }, - { "gfa1", no_argument, &opt::format, GFA1 }, - { "gfa2", no_argument, &opt::format, GFA2 }, - { "gv", no_argument, &opt::format, DOT }, - { "sam", no_argument, &opt::format, SAM }, - { "graph", required_argument, NULL, 'g' }, - { "ignore", required_argument, NULL, 'i' }, - { "remove", required_argument, NULL, 'r' }, - { "SS", no_argument, &opt::ss, 1 }, - { "no-SS", no_argument, &opt::ss, 0 }, - { "kmer", required_argument, NULL, 'k' }, - { "island", required_argument, NULL, 'T' }, - { "tip", required_argument, NULL, 't' }, - { "length", required_argument, NULL, 'l' }, - { "max-length", required_argument, NULL, 'L' }, - { "coverage", required_argument, NULL, 'c' }, - { "max-coverage", required_argument, NULL, 'C' }, - { "shim", no_argument, &opt::shim, 1 }, - { "no-shim", no_argument, &opt::shim, 0 }, + { "adj", no_argument, &opt::format, ADJ }, + { "asqg", no_argument, &opt::format, ASQG }, + { "dot", no_argument, &opt::format, DOT }, + { "gfa", no_argument, &opt::format, GFA1 }, + { "gfa1", no_argument, &opt::format, GFA1 }, + { "gfa2", no_argument, &opt::format, GFA2 }, + { "gv", no_argument, &opt::format, DOT }, + { "sam", no_argument, &opt::format, SAM }, + { "graph", required_argument, NULL, 'g' }, + { "ignore", required_argument, NULL, 'i' }, + { "remove", required_argument, NULL, 'r' }, + { "SS", no_argument, &opt::ss, 1 }, + { "no-SS", no_argument, &opt::ss, 0 }, + { "kmer", required_argument, NULL, 'k' }, + { "island", required_argument, NULL, 'T' }, + { "tip", required_argument, NULL, 't' }, + { "length", required_argument, NULL, 'l' }, + { "max-length", required_argument, NULL, 'L' }, + { "coverage", required_argument, NULL, 'c' }, + { "max-coverage", required_argument, NULL, 'C' }, + { "shim", no_argument, &opt::shim, 1 }, + { "no-shim", no_argument, &opt::shim, 0 }, { "shim-max-degree", required_argument, NULL, OPT_SHIM_MAX_DEG }, - { "assemble", no_argument, &opt::assemble, 1 }, - { "no-assemble", no_argument, &opt::assemble, 0 }, - { "min-overlap", required_argument, NULL, 'm' }, - { "verbose", no_argument, NULL, 'v' }, - { "help", no_argument, NULL, OPT_HELP }, - { "version", no_argument, NULL, OPT_VERSION }, + { "assemble", no_argument, &opt::assemble, 1 }, + { "no-assemble", no_argument, &opt::assemble, 0 }, + { "min-overlap", required_argument, NULL, 'm' }, + { "verbose", no_argument, NULL, 'v' }, + { "help", no_argument, NULL, OPT_HELP }, + { "version", no_argument, NULL, OPT_VERSION }, { NULL, 0, NULL, 0 } }; static vector g_removed; /** Contig adjacency graph. */ -typedef ContigGraph > Graph; +typedef ContigGraph> Graph; typedef Graph::vertex_descriptor vertex_descriptor; typedef Graph::edge_descriptor edge_descriptor; /** Data for verbose output. */ -static struct { +static struct +{ unsigned removed; unsigned tails; unsigned too_long; @@ -194,7 +200,8 @@ } g_count; /** Returns if the contig can be removed from the graph. */ -static bool removable(const Graph* pg, vertex_descriptor v) +static bool +removable(const Graph* pg, vertex_descriptor v) { typedef graph_traits GTraits; typedef GTraits::out_edge_iterator OEit; @@ -248,8 +255,7 @@ if (g[*maxvw].distance < g[*vw].distance) maxvw = vw; - if (g[*maxuv].distance + (int)g[v].length + g[*maxvw].distance > - -opt::minOverlap) { + if (g[*maxuv].distance + (int)g[v].length + g[*maxvw].distance > -opt::minOverlap) { g_count.too_long++; return false; } @@ -257,20 +263,32 @@ } /** Data to store information of an edge. */ -struct EdgeInfo { +struct EdgeInfo +{ vertex_descriptor u; vertex_descriptor w; edge_bundle_type::type ep; EdgeInfo(vertex_descriptor u, vertex_descriptor w, int ep) - : u(u), w(w), ep(ep) {} - EdgeInfo() : u(), w(), ep() {} + : u(u) + , w(w) + , ep(ep) + {} + EdgeInfo() + : u() + , w() + , ep() + {} }; /** Returns a list of edges that may be added when the vertex v is * removed. */ -static bool findNewEdges(const Graph& g, vertex_descriptor v, - vector& eds, vector& markedContigs) +static bool +findNewEdges( + const Graph& g, + vertex_descriptor v, + vector& eds, + vector& markedContigs) { typedef graph_traits GTraits; typedef GTraits::vertex_descriptor V; @@ -288,8 +306,7 @@ // for every edge from u->v and v->w we must add an edge u->w for (IEit uv = iei0; uv != iei1; ++uv) { for (OEit vw = oei0; vw != oei1; ++vw) { - int x = g[*uv].distance + (int)g[v].length + - g[*vw].distance; + int x = g[*uv].distance + (int)g[v].length + g[*vw].distance; assert(x <= 0); EdgeInfo ed(source(*uv, g), target(*vw, g), x); eds.push_back(ed); @@ -299,17 +316,16 @@ marked.push_back(ed.w); } } - for (vector::const_iterator it = marked.begin(); - it != marked.end(); it++) + for (vector::const_iterator it = marked.begin(); it != marked.end(); it++) markedContigs[get(vertex_index, g, *it)] = true; return true; } /** Adds all edges described in the vector eds. */ -static void addNewEdges(Graph& g, const vector& eds) +static void +addNewEdges(Graph& g, const vector& eds) { - for (vector::const_iterator edsit = eds.begin(); - edsit != eds.end(); ++edsit) { + for (vector::const_iterator edsit = eds.begin(); edsit != eds.end(); ++edsit) { // Don't add parallel edges! This can happen when removing a palindrome. if (edge(edsit->u, edsit->w, g).second) { g_count.parallel_edge++; @@ -320,16 +336,18 @@ } } -static void removeContig(vertex_descriptor v, Graph& g) +static void +removeContig(vertex_descriptor v, Graph& g) { - clear_vertex(v, g); - remove_vertex(v, g); - g_removed.push_back(get(vertex_contig_index, g, v)); - g_count.removed++; + clear_vertex(v, g); + remove_vertex(v, g); + g_removed.push_back(get(vertex_contig_index, g, v)); + g_count.removed++; } /** Remove the specified contig from the adjacency graph. */ -static void removeContigs(Graph& g, vector& sc) +static void +removeContigs(Graph& g, vector& sc) { typedef graph_traits GTraits; typedef GTraits::vertex_descriptor V; @@ -338,13 +356,11 @@ out.reserve(sc.size()); vector markedContigs(g.num_vertices()); - for (vector::iterator it = sc.begin(); - it != sc.end(); ++it) { + for (vector::iterator it = sc.begin(); it != sc.end(); ++it) { V v = *it; if (opt::verbose > 0 && ++g_count.checked % 10000000 == 0) - cerr << "Removed " << g_count.removed << "/" - << g_count.checked - << " vertices that have been checked.\n"; + cerr << "Removed " << g_count.removed << "/" << g_count.checked + << " vertices that have been checked.\n"; if (markedContigs[get(vertex_index, g, v)]) { out.push_back(v); @@ -366,40 +382,47 @@ } /** Return the value of the bit at the specified index. */ -struct Marked : unary_function { +struct Marked : unary_function +{ typedef vector Data; Marked(const Graph& g, const Data& data) - : m_g(g), m_data(data) { } - bool operator()(vertex_descriptor u) const - { - return m_data[get(vertex_contig_index, m_g, u)]; - } + : m_g(g) + , m_data(data) + {} + bool operator()(vertex_descriptor u) const { return m_data[get(vertex_contig_index, m_g, u)]; } + private: const Graph& m_g; const Data& m_data; }; /** Finds all potentially removable contigs in the graph. */ -static void findShortContigs(const Graph& g, const vector& seen, - vector& sc) +static void +findShortContigs(const Graph& g, const vector& seen, vector& sc) { typedef graph_traits GTraits; typedef GTraits::vertex_iterator Vit; Vit first, second; tie(first, second) = vertices(g); - ::copy_if(first, second, back_inserter(sc), - !boost::lambda::bind(Marked(g, seen), _1) && boost::lambda::bind(removable, &g, _1)); + ::copy_if( + first, + second, + back_inserter(sc), + !boost::lambda::bind(Marked(g, seen), _1) && boost::lambda::bind(removable, &g, _1)); } /** Functor used for sorting contigs based on degree, then size, * and then ID. */ -struct sortContigs { +struct sortContigs +{ const Graph& g; - sortContigs(const Graph& g) : g(g) {} + sortContigs(const Graph& g) + : g(g) + {} - template - bool operator() (V a, V b) + template + bool operator()(V a, V b) { const ContigProperties& ap = g[a]; const ContigProperties& bp = g[b]; @@ -407,60 +430,71 @@ unsigned dega = out_degree(a, g) * in_degree(a, g); unsigned degb = out_degree(b, g) * in_degree(b, g); - return dega != degb ? dega < degb - : ap.length != bp.length ? ap.length < bp.length - : a < b; + return dega != degb ? dega < degb : ap.length != bp.length ? ap.length < bp.length : a < b; } }; -struct ShorterThanX : unary_function { +struct ShorterThanX : unary_function +{ const Graph& g; const vector& seen; size_t x; ShorterThanX(const Graph& g, const vector& seen, size_t x) - : g(g), seen(seen), x(x) { } + : g(g) + , seen(seen) + , x(x) + {} bool operator()(vertex_descriptor y) const { - return g[y].length < x && !get(vertex_removed, g, y) - && !seen[get(vertex_contig_index, g, y)]; + return g[y].length < x && !get(vertex_removed, g, y) && + !seen[get(vertex_contig_index, g, y)]; } }; -struct LongerThanX : unary_function { +struct LongerThanX : unary_function +{ const Graph& g; const vector& seen; size_t x; LongerThanX(const Graph& g, const vector& seen, size_t x) - : g(g), seen(seen), x(x) { } + : g(g) + , seen(seen) + , x(x) + {} bool operator()(vertex_descriptor y) const { - return g[y].length > x && !get(vertex_removed, g, y) - && !seen[get(vertex_contig_index, g, y)]; + return g[y].length > x && !get(vertex_removed, g, y) && + !seen[get(vertex_contig_index, g, y)]; } }; -struct CoverageLessThan : unary_function { +struct CoverageLessThan : unary_function +{ const Graph& g; const vector& seen; float minCov; CoverageLessThan(const Graph& g, const vector& seen, float minCov) - : g(g), seen(seen), minCov(minCov) { } + : g(g) + , seen(seen) + , minCov(minCov) + {} bool operator()(vertex_descriptor u) const { assert(opt::k > 0); float meanCoverage = (float)g[u].coverage / (g[u].length - opt::k + 1); - return meanCoverage < minCov && !get(vertex_removed, g, u) - && !seen[get(vertex_contig_index, g, u)]; + return meanCoverage < minCov && !get(vertex_removed, g, u) && + !seen[get(vertex_contig_index, g, u)]; } }; -static void removeShims(Graph& g, const vector& seen) +static void +removeShims(Graph& g, const vector& seen) { if (opt::verbose > 0) cerr << "Removing shim contigs from the graph...\n"; @@ -468,25 +502,22 @@ findShortContigs(g, seen, shortContigs); for (unsigned i = 0; !shortContigs.empty(); ++i) { if (opt::verbose > 0) - cerr << "Pass " << i + 1 << ": Checking " - << shortContigs.size() << " contigs.\n"; - sort(shortContigs.begin(), shortContigs.end(), - sortContigs(g)); + cerr << "Pass " << i + 1 << ": Checking " << shortContigs.size() << " contigs.\n"; + sort(shortContigs.begin(), shortContigs.end(), sortContigs(g)); removeContigs(g, shortContigs); } if (opt::verbose > 0) { cerr << "Shim removal stats:\n"; - cerr << "Removed: " << g_count.removed/2 - << " Too Complex: " << g_count.too_complex/2 - << " Tails: " << g_count.tails/2 - << " Too Long: " << g_count.too_long/2 - << " Self Adjacent: " << g_count.self_adj/2 - << " Parallel Edges: " << g_count.parallel_edge/2 << '\n'; + cerr << "Removed: " << g_count.removed / 2 << " Too Complex: " << g_count.too_complex / 2 + << " Tails: " << g_count.tails / 2 << " Too Long: " << g_count.too_long / 2 + << " Self Adjacent: " << g_count.self_adj / 2 + << " Parallel Edges: " << g_count.parallel_edge / 2 << '\n'; } } -template -static void removeContigs_if(Graph& g, pred p) +template +static void +removeContigs_if(Graph& g, pred p) { typedef graph_traits GTraits; typedef GTraits::vertex_iterator Vit; @@ -496,10 +527,11 @@ vector sc; ::copy_if(first, second, back_inserter(sc), p); remove_vertex_if(g, sc.begin(), sc.end(), True()); - transform(sc.begin(), sc.end(), back_inserter(g_removed), - mem_fun_ref(&ContigNode::contigIndex)); + transform(sc.begin(), sc.end(), back_inserter(g_removed), [](const ContigNode& c) { + return c.contigIndex(); + }); if (opt::verbose > 0) - cerr << "Removed " << sc.size()/2 << " contigs.\n"; + cerr << "Removed " << sc.size() / 2 << " contigs.\n"; } /** Contig sequences. */ @@ -507,7 +539,8 @@ static Contigs g_contigs; /** Return the sequence of vertex u. */ -static string getSequence(const Graph& g, vertex_descriptor u) +static string +getSequence(const Graph& g, vertex_descriptor u) { size_t i = get(vertex_contig_index, g, u); assert(i < g_contigs.size()); @@ -516,11 +549,13 @@ } /** Return whether the specified edge is inconsistent. */ -struct is_edge_inconsistent : unary_function { +struct is_edge_inconsistent : unary_function +{ const Graph& g; is_edge_inconsistent(const Graph& g) - : g(g) { } + : g(g) + {} bool operator()(edge_descriptor e) const { @@ -541,15 +576,17 @@ } }; -template -static void remove_edge(Graph& g, It first, It last) +template +static void +remove_edge(Graph& g, It first, It last) { for (; first != last; first++) remove_edge(*first, g); } template -static void removeEdges_if(Graph& g, pred p) +static void +removeEdges_if(Graph& g, pred p) { typedef graph_traits GTraits; typedef GTraits::edge_iterator Eit; @@ -565,86 +602,86 @@ } } -int main(int argc, char** argv) +int +main(int argc, char** argv) { string commandLine; { ostringstream ss; char** last = argv + argc - 1; - copy(argv, last, ostream_iterator(ss, " ")); + copy(argv, last, ostream_iterator(ss, " ")); ss << *last; commandLine = ss.str(); } bool die = false; - for (int c; (c = getopt_long(argc, argv, - shortopts, longopts, NULL)) != -1;) { + for (int c; (c = getopt_long(argc, argv, shortopts, longopts, NULL)) != -1;) { istringstream arg(optarg != NULL ? optarg : ""); switch (c) { - case '?': + case '?': die = true; break; - case 'c': + case 'c': arg >> opt::minCoverage; break; - case 'C': + case 'C': arg >> opt::maxCoverage; break; - case 'l': + case 'l': arg >> opt::minLen; break; - case 'L': + case 'L': arg >> opt::maxLen; break; - case 'm': + case 'm': arg >> opt::minOverlap; break; - case 'g': + case 'g': arg >> opt::graphPath; break; - case 'i': + case 'i': arg >> opt::ignorePath; break; - case 'r': + case 'r': arg >> opt::removePath; break; - case 'k': + case 'k': arg >> opt::k; break; - case 'T': + case 'T': arg >> opt::minIslandLen; break; - case 't': + case 't': arg >> opt::minTipLen; break; - case 'v': + case 'v': opt::verbose++; break; - case OPT_SHIM_MAX_DEG: + case OPT_SHIM_MAX_DEG: arg >> opt::shimMaxDegree; break; - case OPT_HELP: + case OPT_HELP: cout << USAGE_MESSAGE; exit(EXIT_SUCCESS); - case OPT_VERSION: + case OPT_VERSION: cout << VERSION_MESSAGE; exit(EXIT_SUCCESS); } if (optarg != NULL && !arg.eof()) { - cerr << PROGRAM ": invalid option: `-" - << (char)c << optarg << "'\n"; + cerr << PROGRAM ": invalid option: `-" << (char)c << optarg << "'\n"; exit(EXIT_FAILURE); } } if (opt::minOverlap < 0) { cerr << PROGRAM ": " - << "--min-overlap must be a positive integer.\n"; + << "--min-overlap must be a positive integer.\n"; die = true; } if (opt::k <= 0) { - cerr << PROGRAM ": " << "missing -k,--kmer option\n"; + cerr << PROGRAM ": " + << "missing -k,--kmer option\n"; die = true; } @@ -659,8 +696,7 @@ } if (die) { - cerr << "Try `" << PROGRAM - << " --help' for more information.\n"; + cerr << "Try `" << PROGRAM << " --help' for more information.\n"; exit(EXIT_FAILURE); } @@ -669,8 +705,7 @@ { string adjPath(argv[optind++]); if (opt::verbose > 0) - cerr << "Loading graph from file: " << adjPath - << '\n'; + cerr << "Loading graph from file: " << adjPath << '\n'; ifstream fin(adjPath.c_str()); assert_good(fin, adjPath); fin >> g; @@ -698,12 +733,11 @@ size_t b = g_removed.size(); while (in >> s) { size_t i = get(g_contigNames, s); - removeContig(ContigNode(i,0), g); + removeContig(ContigNode(i, 0), g); } assert(in.eof()); if (opt::verbose) - cerr << "Removed " << g_removed.size() - b - << " contigs.\n"; + cerr << "Removed " << g_removed.size() - b << " contigs.\n"; } // Remove shims. @@ -713,11 +747,9 @@ // Remove islands. if (opt::minIslandLen > 0) { size_t s = g_removed.size(); - removeIslands_if(g, back_inserter(g_removed), - ShorterThanX(g, seen, opt::minIslandLen)); + removeIslands_if(g, back_inserter(g_removed), ShorterThanX(g, seen, opt::minIslandLen)); if (opt::verbose) - cerr << "Removed " << g_removed.size() - s - << " islands.\n"; + cerr << "Removed " << g_removed.size() - s << " islands.\n"; } // Remove tips. @@ -726,12 +758,10 @@ s = g_removed.size(); do { prev = g_removed.size(); - pruneTips_if(g, back_inserter(g_removed), - ShorterThanX(g, seen, opt::minTipLen)); + pruneTips_if(g, back_inserter(g_removed), ShorterThanX(g, seen, opt::minTipLen)); } while (prev < g_removed.size()); if (opt::verbose) - cerr << "Removed " << g_removed.size() - s - << " tips.\n"; + cerr << "Removed " << g_removed.size() - s << " tips.\n"; } // Remove short contigs. @@ -748,8 +778,7 @@ // Remove contigs with high mean k-mer coverage. if (opt::maxCoverage > 0) - removeContigs_if(g, - std::not1(CoverageLessThan(g, seen, opt::maxCoverage))); + removeContigs_if(g, std::not1(CoverageLessThan(g, seen, opt::maxCoverage))); // Remove inconsistent edges of spaceseeds if (argc - optind == 1) { @@ -793,8 +822,7 @@ else assemble(g, back_inserter(paths)); g_contigNames.unlock(); - for (ContigPaths::const_iterator it = paths.begin(); - it != paths.end(); ++it) { + for (ContigPaths::const_iterator it = paths.begin(); it != paths.end(); ++it) { ContigNode u(numContigs + it - paths.begin(), false); string name = createContigName(); put(vertex_name, g, u, name); diff -Nru abyss-2.2.3/FMIndex/BitArrays.h abyss-2.2.4/FMIndex/BitArrays.h --- abyss-2.2.3/FMIndex/BitArrays.h 2019-09-27 20:02:10.000000000 +0000 +++ abyss-2.2.4/FMIndex/BitArrays.h 2020-01-30 18:48:44.000000000 +0000 @@ -23,94 +23,84 @@ static T SENTINEL() { return std::numeric_limits::max(); } public: + /** Count the occurrences of the symbols of [first, last). */ + template + void assign(It first, It last) + { + assert(first < last); + m_data.clear(); + + // Determine the size of the alphabet ignoring the sentinel. + T n = 0; + for (It it = first; it != last; ++it) + if (*it != SENTINEL()) + n = std::max(n, *it); + n++; + + assert(n < std::numeric_limits::max()); + m_data.resize(n, wat_array::BitArray(last - first)); + + size_t i = 0; + for (It it = first; it != last; ++it, ++i) { + T c = *it; + if (c == SENTINEL()) + continue; + assert(c < m_data.size()); + m_data[c].SetBit(1, i); + } -/** Count the occurrences of the symbols of [first, last). */ -template -void assign(It first, It last) -{ - assert(first < last); - m_data.clear(); - - // Determine the size of the alphabet ignoring the sentinel. - T n = 0; - for (It it = first; it != last; ++it) - if (*it != SENTINEL()) - n = std::max(n, *it); - n++; - - assert(n < std::numeric_limits::max()); - m_data.resize(n, wat_array::BitArray(last - first)); - - size_t i = 0; - for (It it = first; it != last; ++it, ++i) { - T c = *it; - if (c == SENTINEL()) - continue; - assert(c < m_data.size()); - m_data[c].SetBit(1, i); + std::for_each( + m_data.begin(), m_data.end(), [](wat_array::BitArray& b) { return b.Build(); }); } - std::for_each(m_data.begin(), m_data.end(), - std::mem_fun_ref(&wat_array::BitArray::Build)); -} - -/** Return the size of the string. */ -size_t size() const -{ - assert(!m_data.empty()); - return m_data.front().length(); -} - -/** Return the number of occurrences of the specified symbol. */ -size_t count(T c) const -{ - return m_data[c].one_num(); -} - -/** Return the count of symbol c in s[0, i). */ -size_t rank(T c, size_t i) const -{ - return m_data[c].Rank(1, i); -} - -/** Return the symbol at the specified position. */ -T at(size_t i) const -{ - assert(!m_data.empty()); - assert(i < m_data.front().length()); - for (Data::const_iterator it = m_data.begin(); - it != m_data.end(); ++it) - if (it->Lookup(i)) - return it - m_data.begin(); - return std::numeric_limits::max(); -} - -/** Store this data structure. */ -friend std::ostream& operator<<(std::ostream& out, const BitArrays& o) -{ - uint32_t n = o.m_data.size(); - out.write(reinterpret_cast(&n), sizeof n); - for (Data::const_iterator it = o.m_data.begin(); - it != o.m_data.end(); ++it) - it->Save(out); - return out; -} - -/** Load this data structure. */ -friend std::istream& operator>>(std::istream& in, BitArrays& o) -{ - o.m_data.clear(); - uint32_t n = 0; - if (!in.read(reinterpret_cast(&n), sizeof n)) + /** Return the size of the string. */ + size_t size() const + { + assert(!m_data.empty()); + return m_data.front().length(); + } + + /** Return the number of occurrences of the specified symbol. */ + size_t count(T c) const { return m_data[c].one_num(); } + + /** Return the count of symbol c in s[0, i). */ + size_t rank(T c, size_t i) const { return m_data[c].Rank(1, i); } + + /** Return the symbol at the specified position. */ + T at(size_t i) const + { + assert(!m_data.empty()); + assert(i < m_data.front().length()); + for (Data::const_iterator it = m_data.begin(); it != m_data.end(); ++it) + if (it->Lookup(i)) + return it - m_data.begin(); + return std::numeric_limits::max(); + } + + /** Store this data structure. */ + friend std::ostream& operator<<(std::ostream& out, const BitArrays& o) + { + uint32_t n = o.m_data.size(); + out.write(reinterpret_cast(&n), sizeof n); + for (Data::const_iterator it = o.m_data.begin(); it != o.m_data.end(); ++it) + it->Save(out); + return out; + } + + /** Load this data structure. */ + friend std::istream& operator>>(std::istream& in, BitArrays& o) + { + o.m_data.clear(); + uint32_t n = 0; + if (!in.read(reinterpret_cast(&n), sizeof n)) + return in; + assert(n > 0); + assert(n < std::numeric_limits::max()); + o.m_data.resize(n); + for (Data::iterator it = o.m_data.begin(); it != o.m_data.end(); ++it) + it->Load(in); return in; - assert(n > 0); - assert(n < std::numeric_limits::max()); - o.m_data.resize(n); - for (Data::iterator it = o.m_data.begin(); - it != o.m_data.end(); ++it) - it->Load(in); - return in; -} + } private: typedef std::vector Data; diff -Nru abyss-2.2.3/Graph/ContigGraphAlgorithms.h abyss-2.2.4/Graph/ContigGraphAlgorithms.h --- abyss-2.2.3/Graph/ContigGraphAlgorithms.h 2019-09-27 20:02:10.000000000 +0000 +++ abyss-2.2.4/Graph/ContigGraphAlgorithms.h 2020-01-30 18:48:44.000000000 +0000 @@ -7,8 +7,8 @@ #include "Common/Functional.h" #include "Common/Iterator.h" #include "Graph/ContigGraph.h" -#include #include +#include #include #include #include @@ -18,51 +18,49 @@ /** Return true if the edge e is a palindrome. */ template -struct IsPalindrome : std::unary_function< - typename graph_traits::edge_descriptor, bool> +struct IsPalindrome : std::unary_function::edge_descriptor, bool> { - IsPalindrome(const Graph& g) : m_g(g) { } - bool operator()( - typename graph_traits::edge_descriptor e) const + IsPalindrome(const Graph& g) + : m_g(g) + {} + bool operator()(typename graph_traits::edge_descriptor e) const { - return source(e, m_g) - == get(vertex_complement, m_g, target(e, m_g)); + return source(e, m_g) == get(vertex_complement, m_g, target(e, m_g)); } + private: const Graph& m_g; }; /** Return whether the outgoing edge of vertex u is contiguous. */ template -bool contiguous_out(const Graph& g, - typename graph_traits::vertex_descriptor u) +bool +contiguous_out(const Graph& g, typename graph_traits::vertex_descriptor u) { - return out_degree(u, g) == 1 - && in_degree(*adjacent_vertices(u, g).first, g) == 1; + return out_degree(u, g) == 1 && in_degree(*adjacent_vertices(u, g).first, g) == 1; } /** Return whether the incoming edge of vertex u is contiguous. */ template -bool contiguous_in(const Graph& g, - typename graph_traits::vertex_descriptor u) +bool +contiguous_in(const Graph& g, typename graph_traits::vertex_descriptor u) { return contiguous_out(g, get(vertex_complement, g, u)); } /** Add the outgoing edges of vertex u to vertex uout. */ template -void copy_out_edges(Graph &g, - typename Graph::vertex_descriptor u, - typename Graph::vertex_descriptor uout) -{ - typedef typename graph_traits::vertex_descriptor - vertex_descriptor; - typedef typename graph_traits::out_edge_iterator - out_edge_iterator; +void +copy_out_edges( + Graph& g, + typename Graph::vertex_descriptor u, + typename Graph::vertex_descriptor uout) +{ + typedef typename graph_traits::vertex_descriptor vertex_descriptor; + typedef typename graph_traits::out_edge_iterator out_edge_iterator; typedef typename edge_property::type edge_property_type; assert(u != uout); - std::pair - edges = g.out_edges(u); + std::pair edges = g.out_edges(u); bool palindrome = false; edge_property_type palindrome_ep; for (out_edge_iterator e = edges.first; e != edges.second; ++e) { @@ -87,24 +85,20 @@ /** Add the incoming edges of vertex u to vertex v. */ template -void copy_in_edges(Graph& g, - typename Graph::vertex_descriptor u, - typename Graph::vertex_descriptor v) -{ - copy_out_edges(g, - get(vertex_complement, g, u), - get(vertex_complement, g, v)); +void +copy_in_edges(Graph& g, typename Graph::vertex_descriptor u, typename Graph::vertex_descriptor v) +{ + copy_out_edges(g, get(vertex_complement, g, u), get(vertex_complement, g, v)); } /** Assemble a path of unambigous out edges starting at vertex u. * u itself is not copied to out. */ template -OutIt extend(const Graph& g, - typename Graph::vertex_descriptor u, OutIt out) +OutIt +extend(const Graph& g, typename Graph::vertex_descriptor u, OutIt out) { - typedef typename graph_traits::vertex_descriptor - vertex_descriptor; + typedef typename graph_traits::vertex_descriptor vertex_descriptor; std::set seen; while (out_degree(u, g) == 1 && seen.insert(u).second) { u = *adjacent_vertices(u, g).first; @@ -116,12 +110,10 @@ /** Assemble an unambiguous path starting at vertex u. * Every edge must satisfy the predicate. */ template -OutIt assemble_if(const Graph& g, - typename Graph::vertex_descriptor u, OutIt out, - Predicate pred) +OutIt +assemble_if(const Graph& g, typename Graph::vertex_descriptor u, OutIt out, Predicate pred) { - typedef typename graph_traits::edge_descriptor - edge_descriptor; + typedef typename graph_traits::edge_descriptor edge_descriptor; while (contiguous_out(g, u)) { edge_descriptor e = *out_edges(u, g).first; if (!pred(e)) @@ -138,20 +130,21 @@ * are removed as well. */ template -void remove_vertex_if(Graph& g, It first, It last, Predicate p) +void +remove_vertex_if(Graph& g, It first, It last, Predicate p) { - for_each_if(first, last, - bind1st(std::mem_fun(&Graph::clear_vertex), &g), p); - for_each_if(first, last, - bind1st(std::mem_fun(&Graph::remove_vertex), &g), p); + for_each_if( + first, last, [&g](const ContigNode& c) { return g.clear_vertex(c); }, p); + for_each_if( + first, last, [&g](const ContigNode& c) { return g.remove_vertex(c); }, p); } /** Add the vertex and edge propeties of the path [first, last). */ template -VP addProp(const Graph& g, It first, It last, const VP*) +VP +addProp(const Graph& g, It first, It last, const VP*) { - typedef typename graph_traits::vertex_descriptor - vertex_descriptor; + typedef typename graph_traits::vertex_descriptor vertex_descriptor; assert(first != last); VP vp = get(vertex_bundle, g, *first); for (It it = first + 1; it != last; ++it) { @@ -164,17 +157,17 @@ } template -no_property addProp(const Graph&, It, It, const no_property*) +no_property +addProp(const Graph&, It, It, const no_property*) { return no_property(); } template -typename vertex_property::type addProp(const Graph& g, - It first, It last) +typename vertex_property::type +addProp(const Graph& g, It first, It last) { - return addProp(g, first, last, - (typename vertex_property::type*)NULL); + return addProp(g, first, last, (typename vertex_property::type*)NULL); } /** Merge the vertices in the sequence [first, last). @@ -185,8 +178,7 @@ typename graph_traits::vertex_descriptor merge(Graph& g, It first, It last) { - typedef typename graph_traits::vertex_descriptor - vertex_descriptor; + typedef typename graph_traits::vertex_descriptor vertex_descriptor; assert(first != last); vertex_descriptor u = add_vertex(addProp(g, first, last), g); copy_in_edges(g, *first, u); @@ -197,26 +189,24 @@ /** Assemble unambiguous paths. Write the paths to out. * Every edge must satisfy the predicate. */ template -OutIt assemble_if(Graph& g, OutIt out, Predicate pred0) +OutIt +assemble_if(Graph& g, OutIt out, Predicate pred0) { typedef typename Graph::vertex_iterator vertex_iterator; // pred(e) = !isPalindrome(e) && pred0(e) - binary_compose, - std::unary_negate >, Predicate> - pred(compose2(std::logical_and(), - std::not1(IsPalindrome(g)), pred0)); + binary_compose, std::unary_negate>, Predicate> pred( + compose2(std::logical_and(), std::not1(IsPalindrome(g)), pred0)); std::pair uit = g.vertices(); for (vertex_iterator u = uit.first; u != uit.second; ++u) { - if (!contiguous_out(g, *u) || contiguous_in(g, *u) - || !pred(*out_edges(*u, g).first)) + if (!contiguous_out(g, *u) || contiguous_in(g, *u) || !pred(*out_edges(*u, g).first)) continue; typename output_iterator_traits::value_type path; assemble_if(g, *u, back_inserter(path), pred); assert(path.size() >= 2); assert(path.front() != path.back()); merge(g, path.begin(), path.end()); - remove_vertex_if(g, path.begin(), path.end(), - not1(std::mem_fun_ref(&ContigNode::ambiguous))); + remove_vertex_if( + g, path.begin(), path.end(), [](const ContigNode& c) { return !c.ambiguous(); }); *out++ = path; } return out; @@ -224,25 +214,25 @@ /** Assemble unambiguous paths. Write the paths to out. */ template -OutIt assemble(Graph& g, OutIt out) +OutIt +assemble(Graph& g, OutIt out) { - typedef typename graph_traits::edge_descriptor - edge_descriptor; + typedef typename graph_traits::edge_descriptor edge_descriptor; return assemble_if(g, out, True()); } /** Return true if the edge e is +ve sense. */ template -struct IsPositive : std::unary_function< - typename graph_traits::edge_descriptor, bool> +struct IsPositive : std::unary_function::edge_descriptor, bool> { - IsPositive(const Graph& g) : m_g(g) { } - bool operator()( - typename graph_traits::edge_descriptor e) const + IsPositive(const Graph& g) + : m_g(g) + {} + bool operator()(typename graph_traits::edge_descriptor e) const { - return !get(vertex_sense, m_g, source(e, m_g)) - && !get(vertex_sense, m_g, target(e, m_g)); + return !get(vertex_sense, m_g, source(e, m_g)) && !get(vertex_sense, m_g, target(e, m_g)); } + private: const Graph& m_g; }; @@ -250,7 +240,8 @@ /** Assemble unambiguous paths in forward orientation only. * Write the paths to out. */ template -OutIt assemble_stranded(Graph& g, OutIt out) +OutIt +assemble_stranded(Graph& g, OutIt out) { return assemble_if(g, out, IsPositive(g)); } @@ -260,8 +251,9 @@ * deg+(v) = 0, and p(v) is true. * Stores all removed vertices in result. */ -template -OutputIt pruneTips_if(Graph& g, OutputIt result, Pred p) +template +OutputIt +pruneTips_if(Graph& g, OutputIt result, Pred p) { typedef typename graph_traits::adjacency_iterator Vit; typedef typename graph_traits::vertex_iterator Uit; @@ -277,7 +269,7 @@ std::pair vrange = adjacent_vertices(u, g); for (Vit vit = vrange.first; vit != vrange.second; ++vit) { V v = *vit; - //assert(v != u); + // assert(v != u); if (out_degree(v, g) == 0 && p(v)) tips.push_back(v); } @@ -285,23 +277,23 @@ /** Remove the tips. */ remove_vertex_if(g, tips.begin(), tips.end(), True()); - std::transform(tips.begin(), tips.end(), result, - std::mem_fun_ref(&ContigNode::contigIndex)); - + std::transform( + tips.begin(), tips.end(), result, [](const ContigNode& c) { return c.contigIndex(); }); return result; } /** Return true if the vertex is a normal 1-in 0-out tip. */ template -struct IsTip : std::unary_function< - typename graph_traits::vertex_descriptor, bool> +struct IsTip : std::unary_function::vertex_descriptor, bool> { - IsTip(const Graph& g) : m_g(g) { } - bool operator()( - typename graph_traits::vertex_descriptor v) const + IsTip(const Graph& g) + : m_g(g) + {} + bool operator()(typename graph_traits::vertex_descriptor v) const { return in_degree(v, m_g) == 1; } + private: const Graph& m_g; }; @@ -311,8 +303,9 @@ * and deg-(v) = 1 and deg+(v) = 0. * Stores all removed vertices in result. */ -template -OutputIt pruneTips(Graph& g, OutputIt result) +template +OutputIt +pruneTips(Graph& g, OutputIt result) { return pruneTips_if(g, result, IsTip(g)); } @@ -322,8 +315,9 @@ * true. * Stores all removed vertices in result. */ -template -OutputIt removeIslands_if(Graph& g, OutputIt result, Pred p) +template +OutputIt +removeIslands_if(Graph& g, OutputIt result, Pred p) { typedef typename graph_traits::vertex_iterator Uit; typedef typename graph_traits::vertex_descriptor V; @@ -344,8 +338,9 @@ } /** Add missing complementary edges. */ -template -size_t addComplementaryEdges(ContigGraph& g) +template +size_t +addComplementaryEdges(ContigGraph& g) { typedef ContigGraph Graph; typedef graph_traits GTraits; diff -Nru abyss-2.2.3/KAligner/Aligner.h abyss-2.2.4/KAligner/Aligner.h --- abyss-2.2.3/KAligner/Aligner.h 2019-09-27 20:02:10.000000000 +0000 +++ abyss-2.2.4/KAligner/Aligner.h 2020-01-30 18:48:44.000000000 +0000 @@ -1,13 +1,13 @@ #ifndef ALIGNER_H #define ALIGNER_H 1 -#include "config.h" -#include "KAligner/Options.h" #include "Alignment.h" #include "ConstString.h" #include "Functional.h" +#include "KAligner/Options.h" #include "Kmer.h" #include "UnorderedMap.h" +#include "config.h" #include #include #include // for strcpy @@ -27,131 +27,124 @@ { uint32_t contig; uint32_t pos; // 0 indexed - Position(uint32_t contig = std::numeric_limits::max(), - uint32_t pos = std::numeric_limits::max()) - : contig(contig), pos(pos) { } + Position( + uint32_t contig = std::numeric_limits::max(), + uint32_t pos = std::numeric_limits::max()) + : contig(contig) + , pos(pos) + {} /** Mark this seed as a duplicate. */ - void setDuplicate(const char* thisContig, const char* otherContig, - const Sequence& kmer) + void setDuplicate(const char* thisContig, const char* otherContig, const Sequence& kmer) { if (opt::multimap == opt::IGNORE) contig = std::numeric_limits::max(); else { - std::cerr << "error: duplicate k-mer in " - << thisContig - << " also in " - << otherContig - << ": " << kmer << '\n'; + std::cerr << "error: duplicate k-mer in " << thisContig << " also in " << otherContig + << ": " << kmer << '\n'; exit(EXIT_FAILURE); } } /** Return whether this seed is a duplciate. */ - bool isDuplicate() const - { - return contig == std::numeric_limits::max(); - } + bool isDuplicate() const { return contig == std::numeric_limits::max(); } }; -typedef unordered_multimap > - SeqPosHashMultiMap; +typedef unordered_multimap> SeqPosHashMultiMap; #if HAVE_GOOGLE_SPARSE_HASH_MAP -# include -typedef google::sparse_hash_map > - SeqPosHashUniqueMap; +#include +typedef google::sparse_hash_map> SeqPosHashUniqueMap; #else -typedef unordered_map > - SeqPosHashUniqueMap; +typedef unordered_map> SeqPosHashUniqueMap; #endif - typedef std::vector AlignmentVector; /** * Index a target sequence and align query sequences to that indexed * target. */ -template +template class Aligner { - public: - typedef typename SeqPosHashMap::iterator map_iterator; - typedef typename SeqPosHashMap::const_iterator - map_const_iterator; - - Aligner(int hashSize, size_t buckets) - : m_hashSize(hashSize), m_target(buckets) { } - - Aligner(int hashSize, size_t buckets, float factor) - : m_hashSize(hashSize) - { - m_target.max_load_factor(factor); - m_target.rehash(buckets); - } + public: + typedef typename SeqPosHashMap::iterator map_iterator; + typedef typename SeqPosHashMap::const_iterator map_const_iterator; + + Aligner(int hashSize, size_t buckets) + : m_hashSize(hashSize) + , m_target(buckets) + {} - void addReferenceSequence(const StringID& id, - const Sequence& seq); - void addReferenceSequence(const Kmer& kmer, Position pos); - - template - void alignRead(const std::string& qid, const Sequence& seq, - oiterator dest); - - size_t size() const { return m_target.size(); } - size_t bucket_count() const - { - return m_target.bucket_count(); - } + Aligner(int hashSize, size_t buckets, float factor) + : m_hashSize(hashSize) + { + m_target.max_load_factor(factor); + m_target.rehash(buckets); + } - /** Return the number of duplicate k-mer in the target. */ - size_t countDuplicates() const - { - assert(opt::multimap == opt::IGNORE); - return count_if(m_target.begin(), m_target.end(), - compose1(std::mem_fun_ref(&Position::isDuplicate), - mem_var(&SeqPosHashMap::value_type::second))); - } + void addReferenceSequence(const StringID& id, const Sequence& seq); + void addReferenceSequence(const Kmer& kmer, Position pos); - private: - explicit Aligner(const Aligner&); + template + void alignRead(const std::string& qid, const Sequence& seq, oiterator dest); - typedef std::map AlignmentSet; + size_t size() const { return m_target.size(); } + size_t bucket_count() const { return m_target.bucket_count(); } - void alignKmer( - AlignmentSet& aligns, const Sequence& kmer, - bool isRC, bool good, int read_ind, int seqLen); + /** Return the number of duplicate k-mer in the target. */ + size_t countDuplicates() const + { + assert(opt::multimap == opt::IGNORE); + return count_if( + m_target.begin(), m_target.end(), [](const std::pair& s) { + return s.second.isDuplicate(); + }); + } - AlignmentSet getAlignmentsInternal( - const Sequence& seq, bool isRC); + private: + explicit Aligner(const Aligner&); - template - void coalesceAlignments( - const std::string& qid, const std::string& seq, - const AlignmentSet& alignSet, - oiterator& dest); + typedef std::map AlignmentSet; - // The number of bases to hash on - int m_hashSize; + void alignKmer( + AlignmentSet& aligns, + const Sequence& kmer, + bool isRC, + bool good, + int read_ind, + int seqLen); + + AlignmentSet getAlignmentsInternal(const Sequence& seq, bool isRC); + + template + void coalesceAlignments( + const std::string& qid, + const std::string& seq, + const AlignmentSet& alignSet, + oiterator& dest); + + // The number of bases to hash on + int m_hashSize; - /** A map of k-mer to contig coordinates. */ - SeqPosHashMap m_target; + /** A map of k-mer to contig coordinates. */ + SeqPosHashMap m_target; - /** A dictionary of contig IDs. */ - std::vector m_dict; + /** A dictionary of contig IDs. */ + std::vector m_dict; - unsigned contigIDToIndex(const std::string& id) - { - m_dict.push_back(id); - return m_dict.size() - 1; - } + unsigned contigIDToIndex(const std::string& id) + { + m_dict.push_back(id); + return m_dict.size() - 1; + } - cstring contigIndexToID(unsigned index) - { - assert(index < m_dict.size()); - return m_dict[index]; - } + cstring contigIndexToID(unsigned index) + { + assert(index < m_dict.size()); + return m_dict[index]; + } }; #endif diff -Nru abyss-2.2.3/KAligner/PipeMux.h abyss-2.2.4/KAligner/PipeMux.h --- abyss-2.2.3/KAligner/PipeMux.h 2019-09-27 20:02:10.000000000 +0000 +++ abyss-2.2.4/KAligner/PipeMux.h 2020-01-30 18:48:44.000000000 +0000 @@ -2,16 +2,19 @@ #define PIPEMUX_H 1 #include "Pipe.h" -#include +#include "Semaphore.h" #include #include -template -class PipeMux { +template +class PipeMux +{ public: - /** Default constructor. */ - PipeMux(size_t pipe_size = 1) - : m_index(0), m_entry_num(0), m_pipe_size(pipe_size) + /** Default constructor. */ + PipeMux(size_t pipe_size = 1) + : m_index(0) + , m_entry_num(0) + , m_pipe_size(pipe_size) { pthread_rwlock_init(&m_rwlock_vecs, NULL); pthread_mutex_init(&m_mutex_index, NULL); @@ -94,7 +97,8 @@ } /** Checks that the PipeMux is empty. */ - bool empty() { + bool empty() + { pthread_rwlock_rdlock(&m_rwlock_vecs); bool isEmpty = m_pipes.empty(); pthread_rwlock_unlock(&m_rwlock_vecs); @@ -128,11 +132,11 @@ } assert(i < m_pipes.size()); delete m_pipes[i]; - m_pipes.erase(m_pipes.begin()+i); + m_pipes.erase(m_pipes.begin() + i); assert(i < m_mutex_pipes.size()); pthread_mutex_destroy(m_mutex_pipes[i]); delete m_mutex_pipes[i]; - m_mutex_pipes.erase(m_mutex_pipes.begin()+i); + m_mutex_pipes.erase(m_mutex_pipes.begin() + i); // Make sure the index points to the next element. pthread_mutex_lock(&m_mutex_index); m_index = m_index == m_pipes.size() ? 0 : m_index; @@ -140,7 +144,6 @@ pthread_rwlock_unlock(&m_rwlock_vecs); } - }; #endif diff -Nru abyss-2.2.3/Layout/layout.cc abyss-2.2.4/Layout/layout.cc --- abyss-2.2.3/Layout/layout.cc 2019-09-27 20:02:10.000000000 +0000 +++ abyss-2.2.4/Layout/layout.cc 2020-01-30 18:48:44.000000000 +0000 @@ -1,7 +1,5 @@ -#include "config.h" #include "ContigPath.h" #include "ContigProperties.h" -#include "Uncompress.h" #include "Graph/Assemble.h" #include "Graph/ContigGraph.h" #include "Graph/ContigGraphAlgorithms.h" @@ -9,6 +7,8 @@ #include "Graph/GraphAlgorithms.h" #include "Graph/GraphIO.h" #include "Graph/GraphUtil.h" +#include "Uncompress.h" +#include "config.h" #include #include #include @@ -23,92 +23,94 @@ #define PROGRAM "abyss-layout" -static const char VERSION_MESSAGE[] = -PROGRAM " (" PACKAGE_NAME ") " VERSION "\n" -"Written by Shaun Jackman.\n" -"\n" -"Copyright 2012 Shaun Jackman\n"; +static const char VERSION_MESSAGE[] = PROGRAM " (" PACKAGE_NAME ") " VERSION "\n" + "Written by Shaun Jackman.\n" + "\n" + "Copyright 2012 Shaun Jackman\n"; static const char USAGE_MESSAGE[] = -"Usage: " PROGRAM " [OPTION]... OVERLAP\n" -"Layout contigs using the sequence overlap graph.\n" -"Output sequence paths.\n" -"\n" -" Arguments:\n" -"\n" -" OVERLAP the sequence overlap graph\n" -"\n" -" Options:\n" -"\n" -" -s, --min-length=N minimum sequence length [0]\n" -" -m, --min-overlap=N minimum overlap [0]\n" -" -k, --kmer=N length of a k-mer\n" -" -o, --out=FILE write the paths to FILE\n" -" -g, --graph=FILE write the graph to FILE\n" -" --tred remove transitive edges\n" -" --no-tred do not remove transitive edges [default]\n" -" --SS expect contigs to be oriented correctly\n" -" --no-SS no assumption about contig orientation [default]\n" -" -v, --verbose display verbose output\n" -" --help display this help and exit\n" -" --version output version information and exit\n" -"\n" -"Report bugs to <" PACKAGE_BUGREPORT ">.\n"; + "Usage: " PROGRAM " [OPTION]... OVERLAP\n" + "Layout contigs using the sequence overlap graph.\n" + "Output sequence paths.\n" + "\n" + " Arguments:\n" + "\n" + " OVERLAP the sequence overlap graph\n" + "\n" + " Options:\n" + "\n" + " -s, --min-length=N minimum sequence length [0]\n" + " -m, --min-overlap=N minimum overlap [0]\n" + " -k, --kmer=N length of a k-mer\n" + " -o, --out=FILE write the paths to FILE\n" + " -g, --graph=FILE write the graph to FILE\n" + " --tred remove transitive edges\n" + " --no-tred do not remove transitive edges [default]\n" + " --SS expect contigs to be oriented correctly\n" + " --no-SS no assumption about contig orientation [default]\n" + " -v, --verbose display verbose output\n" + " --help display this help and exit\n" + " --version output version information and exit\n" + "\n" + "Report bugs to <" PACKAGE_BUGREPORT ">.\n"; namespace opt { - unsigned k; // used by ContigProperties +unsigned k; // used by ContigProperties - /** Minimum sequence length. */ - static unsigned minLength; +/** Minimum sequence length. */ +static unsigned minLength; - /** Minimum overlap. */ - static unsigned minOverlap; +/** Minimum overlap. */ +static unsigned minOverlap; - /** Write the paths to this file. */ - static string out; +/** Write the paths to this file. */ +static string out; - /** Write the graph to this file. */ - static string graphPath; +/** Write the graph to this file. */ +static string graphPath; - /** Remove transitive edges. */ - static int tred; +/** Remove transitive edges. */ +static int tred; - /** Run a strand-specific RNA-Seq assembly. */ - static int ss; +/** Run a strand-specific RNA-Seq assembly. */ +static int ss; - /** Verbose output. */ - int verbose; // used by PopBubbles +/** Verbose output. */ +int verbose; // used by PopBubbles - /** Output format */ - int format = DOT; +/** Output format */ +int format = DOT; } static const char shortopts[] = "g:k:m:o:s:v"; -enum { OPT_HELP = 1, OPT_VERSION }; - -static const struct option longopts[] = { - { "graph", required_argument, NULL, 'g' }, - { "kmer", required_argument, NULL, 'k' }, - { "min-overlap", required_argument, NULL, 'm' }, - { "out", required_argument, NULL, 'o' }, - { "min-length", required_argument, NULL, 's' }, - { "tred", no_argument, &opt::tred, true }, - { "no-tred", no_argument, &opt::tred, false }, - { "SS", no_argument, &opt::ss, 1 }, - { "no-SS", no_argument, &opt::ss, 0 }, - { "verbose", no_argument, NULL, 'v' }, - { "help", no_argument, NULL, OPT_HELP }, - { "version", no_argument, NULL, OPT_VERSION }, - { NULL, 0, NULL, 0 } +enum +{ + OPT_HELP = 1, + OPT_VERSION }; +static const struct option longopts[] = { { "graph", required_argument, NULL, 'g' }, + { "kmer", required_argument, NULL, 'k' }, + { "min-overlap", required_argument, NULL, 'm' }, + { "out", required_argument, NULL, 'o' }, + { "min-length", required_argument, NULL, 's' }, + { "tred", no_argument, &opt::tred, true }, + { "no-tred", no_argument, &opt::tred, false }, + { "SS", no_argument, &opt::ss, 1 }, + { "no-SS", no_argument, &opt::ss, 0 }, + { "verbose", no_argument, NULL, 'v' }, + { "help", no_argument, NULL, OPT_HELP }, + { "version", no_argument, NULL, OPT_VERSION }, + { NULL, 0, NULL, 0 } }; + /** An overlap graph. */ typedef DirectedGraph DG; typedef ContigGraph Graph; /** Remove short vertices. */ -static void filterVertices(Graph& g, unsigned minLength) +static void +filterVertices(Graph& g, unsigned minLength) { typedef graph_traits GTraits; typedef GTraits::vertex_descriptor V; @@ -131,15 +133,17 @@ } if (opt::verbose > 0) { - cerr << "Ignored " << numRemoved << " sequences shorter than " - << minLength << " bp.\n"; + cerr << "Ignored " << numRemoved << " sequences shorter than " << minLength << " bp.\n"; printGraphStats(cerr, g); } } /** Return true if the edge is a small overlap. */ -struct IsSmallOverlap { - IsSmallOverlap(Graph& g) : m_g(g) { } +struct IsSmallOverlap +{ + IsSmallOverlap(Graph& g) + : m_g(g) + {} bool operator()(graph_traits::edge_descriptor e) const { int maxDistance = -opt::minOverlap; @@ -149,7 +153,8 @@ }; /** Remove small overlaps. */ -static void filterEdges(Graph& g, unsigned minOverlap) +static void +filterEdges(Graph& g, unsigned minOverlap) { if (minOverlap == 0) return; @@ -163,7 +168,8 @@ } /** Read a graph from the specified file. */ -static void readGraph(const string& path, Graph& g) +static void +readGraph(const string& path, Graph& g) { if (opt::verbose > 0) cerr << "Reading `" << path << "'...\n"; @@ -178,7 +184,8 @@ } /** Return the length histogram. */ -static Histogram buildLengthHistogram(const Graph& g) +static Histogram +buildLengthHistogram(const Graph& g) { typedef graph_traits::vertex_descriptor V; typedef graph_traits::vertex_iterator Vit; @@ -193,44 +200,43 @@ } /** Run abyss-layout. */ -int main(int argc, char** argv) +int +main(int argc, char** argv) { bool die = false; - for (int c; (c = getopt_long(argc, argv, - shortopts, longopts, NULL)) != -1;) { + for (int c; (c = getopt_long(argc, argv, shortopts, longopts, NULL)) != -1;) { istringstream arg(optarg != NULL ? optarg : ""); switch (c) { - case '?': + case '?': die = true; break; - case 'k': + case 'k': arg >> opt::k; break; - case 'g': + case 'g': arg >> opt::graphPath; break; - case 'm': + case 'm': arg >> opt::minOverlap; break; - case 'o': + case 'o': arg >> opt::out; break; - case 's': + case 's': arg >> opt::minLength; break; - case 'v': + case 'v': opt::verbose++; break; - case OPT_HELP: + case OPT_HELP: cout << USAGE_MESSAGE; exit(EXIT_SUCCESS); - case OPT_VERSION: + case OPT_VERSION: cout << VERSION_MESSAGE; exit(EXIT_SUCCESS); } if (optarg != NULL && !arg.eof()) { - cerr << PROGRAM ": invalid option: `-" - << (char)c << optarg << "'\n"; + cerr << PROGRAM ": invalid option: `-" << (char)c << optarg << "'\n"; exit(EXIT_FAILURE); } } @@ -241,8 +247,7 @@ } if (die) { - cerr << "Try `" << PROGRAM - << " --help' for more information.\n"; + cerr << "Try `" << PROGRAM << " --help' for more information.\n"; exit(EXIT_FAILURE); } @@ -263,8 +268,7 @@ if (opt::tred) { unsigned numTransitive = remove_transitive_edges(g); if (opt::verbose > 0) { - cerr << "Removed " << numTransitive - << " transitive edges.\n"; + cerr << "Removed " << numTransitive << " transitive edges.\n"; printGraphStats(cerr, g); } } @@ -278,11 +282,9 @@ sort(paths.begin(), paths.end()); if (opt::verbose > 0) { unsigned n = 0; - for (ContigPaths::const_iterator it = paths.begin(); - it != paths.end(); ++it) + for (ContigPaths::const_iterator it = paths.begin(); it != paths.end(); ++it) n += it->size(); - cerr << "Assembled " << n << " sequences in " - << paths.size() << " contigs.\n"; + cerr << "Assembled " << n << " sequences in " << paths.size() << " contigs.\n"; printGraphStats(cerr, g); } @@ -291,18 +293,16 @@ ostream& out = opt::out.empty() || opt::out == "-" ? cout : fout; assert_good(out, opt::out); g_contigNames.unlock(); - for (vector::const_iterator it = paths.begin(); - it != paths.end(); ++it) + for (vector::const_iterator it = paths.begin(); it != paths.end(); ++it) out << createContigName() << '\t' << *it << '\n'; assert_good(out, opt::out); // Create the new vertices. - for (vector::const_iterator it = paths.begin(); - it != paths.end(); ++it) { + for (vector::const_iterator it = paths.begin(); it != paths.end(); ++it) { const ContigPath& path = *it; merge(g, path.begin(), path.end()); - remove_vertex_if(g, path.begin(), path.end(), - not1(std::mem_fun_ref(&ContigNode::ambiguous))); + remove_vertex_if( + g, path.begin(), path.end(), [](const ContigNode& c) { return !c.ambiguous(); }); } if (opt::verbose > 0) printGraphStats(cerr, g); diff -Nru abyss-2.2.3/Makefile.am abyss-2.2.4/Makefile.am --- abyss-2.2.3/Makefile.am 2019-09-27 20:02:10.000000000 +0000 +++ abyss-2.2.4/Makefile.am 2020-01-30 18:48:44.000000000 +0000 @@ -69,8 +69,16 @@ clang-format: for i in Bloom/RollingBloomDBGVisitor.h Bloom/bloom.cc BloomDBG/BloomIO.h \ BloomDBG/Checkpoint.h BloomDBG/HashAgnosticCascadingBloom.h BloomDBG/bloom-dbg.* \ + ABYSS/abyss.cc Assembly/BranchGroup.h FMIndex/BitArrays.h FilterGraph/FilterGraph.cc \ + Graph/ContigGraphAlgorithms.h KAligner/Aligner.h KAligner/PipeMux.h Layout/layout.cc \ + MergePaths/MergeContigs.cpp MergePaths/MergePaths.cpp ParseAligns/ParseAligns.cpp \ + ParseAligns/abyss-fixmate.cc PathOverlap/PathOverlap.cpp PopBubbles/PopBubbles.cpp Scaffold/scaffold.cc \ Unittest/BloomDBG/HashAgnosticCascadingBloomTest.cpp; do clang-format -style=file $$i >$$i.fixed; done for i in Bloom/RollingBloomDBGVisitor.h Bloom/bloom.cc BloomDBG/BloomIO.h \ BloomDBG/Checkpoint.h BloomDBG/HashAgnosticCascadingBloom.h BloomDBG/bloom-dbg.* \ + ABYSS/abyss.cc Assembly/BranchGroup.h FMIndex/BitArrays.h FilterGraph/FilterGraph.cc \ + Graph/ContigGraphAlgorithms.h KAligner/Aligner.h KAligner/PipeMux.h Layout/layout.cc \ + MergePaths/MergeContigs.cpp MergePaths/MergePaths.cpp ParseAligns/ParseAligns.cpp \ + ParseAligns/abyss-fixmate.cc PathOverlap/PathOverlap.cpp PopBubbles/PopBubbles.cpp Scaffold/scaffold.cc \ Unittest/BloomDBG/HashAgnosticCascadingBloomTest.cpp; do diff -su $$i $$i.fixed && rm -f $$i.fixed; done if ls *.fixed; then exit 1; fi diff -Nru abyss-2.2.3/MergePaths/MergeContigs.cpp abyss-2.2.4/MergePaths/MergeContigs.cpp --- abyss-2.2.3/MergePaths/MergeContigs.cpp 2019-09-27 20:02:10.000000000 +0000 +++ abyss-2.2.4/MergePaths/MergeContigs.cpp 2020-01-30 18:48:44.000000000 +0000 @@ -1,24 +1,26 @@ -#include "config.h" #include "Common/Options.h" #include "ContigNode.h" #include "ContigPath.h" #include "ContigProperties.h" +#include "DataBase/DB.h" +#include "DataBase/Options.h" #include "DataLayer/Options.h" #include "Dictionary.h" #include "FastaReader.h" -#include "Histogram.h" -#include "IOUtil.h" -#include "MemoryUtil.h" -#include "smith_waterman.h" -#include "Sequence.h" -#include "StringUtil.h" -#include "Uncompress.h" #include "Graph/ContigGraph.h" #include "Graph/ContigGraphAlgorithms.h" #include "Graph/DirectedGraph.h" #include "Graph/GraphIO.h" #include "Graph/GraphUtil.h" #include "Graph/Options.h" +#include "Histogram.h" +#include "IOUtil.h" +#include "MemoryUtil.h" +#include "Sequence.h" +#include "StringUtil.h" +#include "Uncompress.h" +#include "config.h" +#include "smith_waterman.h" #include #include #include @@ -26,8 +28,6 @@ #include #include #include -#include "DataBase/Options.h" -#include "DataBase/DB.h" using namespace std; @@ -36,104 +36,116 @@ DB db; static const char VERSION_MESSAGE[] = -PROGRAM " (" PACKAGE_NAME ") " VERSION "\n" -"Written by Shaun Jackman.\n" -"\n" -"Copyright 2014 Canada's Michael Smith Genome Sciences Centre\n"; + PROGRAM " (" PACKAGE_NAME ") " VERSION "\n" + "Written by Shaun Jackman.\n" + "\n" + "Copyright 2014 Canada's Michael Smith Genome Sciences Centre\n"; static const char USAGE_MESSAGE[] = -"Usage: " PROGRAM " -k -o [OPTION]... FASTA [OVERLAP] PATH\n" -"Merge paths of contigs to create larger contigs.\n" -"\n" -" Arguments:\n" -"\n" -" FASTA contigs in FASTA format\n" -" OVERLAP contig overlap graph\n" -" PATH sequences of contig IDs\n" -"\n" -" Options:\n" -"\n" -" -k, --kmer=KMER_SIZE k-mer size\n" -" -o, --out=FILE output the merged contigs to FILE [stdout]\n" -" -g, --graph=FILE write the contig overlap graph to FILE\n" -" --merged output only merged contigs\n" -" --adj output the graph in adj format\n" -" --dot output the graph in dot format [default]\n" -" --dot-meancov same as above but give the mean coverage\n" -" --gfa output the graph in GFA1 format\n" -" --gfa1 output the graph in GFA1 format\n" -" --gfa2 output the graph in GFA2 format\n" -" --gv output the graph in GraphViz format\n" -" --sam output the graph in SAM format\n" -" -v, --verbose display verbose output\n" -" --help display this help and exit\n" -" --version output version information and exit\n" -" --db=FILE specify path of database repository in FILE\n" -" --library=NAME specify library NAME for database\n" -" --strain=NAME specify strain NAME for database\n" -" --species=NAME specify species NAME for database\n" -"\n" -"Report bugs to <" PACKAGE_BUGREPORT ">.\n"; + "Usage: " PROGRAM " -k -o [OPTION]... FASTA [OVERLAP] PATH\n" + "Merge paths of contigs to create larger contigs.\n" + "\n" + " Arguments:\n" + "\n" + " FASTA contigs in FASTA format\n" + " OVERLAP contig overlap graph\n" + " PATH sequences of contig IDs\n" + "\n" + " Options:\n" + "\n" + " -k, --kmer=KMER_SIZE k-mer size\n" + " -o, --out=FILE output the merged contigs to FILE [stdout]\n" + " -g, --graph=FILE write the contig overlap graph to FILE\n" + " --merged output only merged contigs\n" + " --adj output the graph in adj format\n" + " --dot output the graph in dot format [default]\n" + " --dot-meancov same as above but give the mean coverage\n" + " --gfa output the graph in GFA1 format\n" + " --gfa1 output the graph in GFA1 format\n" + " --gfa2 output the graph in GFA2 format\n" + " --gv output the graph in GraphViz format\n" + " --sam output the graph in SAM format\n" + " -v, --verbose display verbose output\n" + " --help display this help and exit\n" + " --version output version information and exit\n" + " --db=FILE specify path of database repository in FILE\n" + " --library=NAME specify library NAME for database\n" + " --strain=NAME specify strain NAME for database\n" + " --species=NAME specify species NAME for database\n" + "\n" + "Report bugs to <" PACKAGE_BUGREPORT ">.\n"; namespace opt { - string db; - dbVars metaVars; - unsigned k; // used by ContigProperties - unsigned pathCount; // num of initial paths +string db; +dbVars metaVars; +unsigned k; // used by ContigProperties +unsigned pathCount; // num of initial paths - /** Output FASTA path. */ - static string out = "-"; +/** Output FASTA path. */ +static string out = "-"; - /** Output graph path. */ - static string graphPath; +/** Output graph path. */ +static string graphPath; - /** Output graph format. */ - int format = DOT; +/** Output graph format. */ +int format = DOT; - /** Output only merged contigs. */ - int onlyMerged; +/** Output only merged contigs. */ +int onlyMerged; - /** Minimum overlap. */ - static unsigned minOverlap = 20; +/** Minimum overlap. */ +static unsigned minOverlap = 20; - /** Minimum alignment identity. */ - static float minIdentity = 0.9; +/** Minimum alignment identity. */ +static float minIdentity = 0.9; } static const char shortopts[] = "g:k:o:v"; -enum { OPT_HELP = 1, OPT_VERSION, OPT_DB, OPT_LIBRARY, OPT_STRAIN, OPT_SPECIES }; -//enum { OPT_HELP = 1, OPT_VERSION }; - -static const struct option longopts[] = { - { "adj", no_argument, &opt::format, ADJ }, - { "dot", no_argument, &opt::format, DOT }, - { "dot-meancov", no_argument, &opt::format, DOT_MEANCOV }, - { "gfa", no_argument, &opt::format, GFA1 }, - { "gfa1", no_argument, &opt::format, GFA1 }, - { "gfa2", no_argument, &opt::format, GFA2 }, - { "gv", no_argument, &opt::format, DOT }, - { "sam", no_argument, &opt::format, SAM }, - { "graph", required_argument, NULL, 'g' }, - { "kmer", required_argument, NULL, 'k' }, - { "merged", no_argument, &opt::onlyMerged, 1 }, - { "out", required_argument, NULL, 'o' }, - { "path", required_argument, NULL, 'p' }, - { "verbose", no_argument, NULL, 'v' }, - { "help", no_argument, NULL, OPT_HELP }, - { "version", no_argument, NULL, OPT_VERSION }, - { "db", required_argument, NULL, OPT_DB }, - { "library", required_argument, NULL, OPT_LIBRARY }, - { "strain", required_argument, NULL, OPT_STRAIN }, - { "species", required_argument, NULL, OPT_SPECIES }, - { NULL, 0, NULL, 0 } +enum +{ + OPT_HELP = 1, + OPT_VERSION, + OPT_DB, + OPT_LIBRARY, + OPT_STRAIN, + OPT_SPECIES }; +// enum { OPT_HELP = 1, OPT_VERSION }; + +static const struct option longopts[] = { { "adj", no_argument, &opt::format, ADJ }, + { "dot", no_argument, &opt::format, DOT }, + { "dot-meancov", no_argument, &opt::format, DOT_MEANCOV }, + { "gfa", no_argument, &opt::format, GFA1 }, + { "gfa1", no_argument, &opt::format, GFA1 }, + { "gfa2", no_argument, &opt::format, GFA2 }, + { "gv", no_argument, &opt::format, DOT }, + { "sam", no_argument, &opt::format, SAM }, + { "graph", required_argument, NULL, 'g' }, + { "kmer", required_argument, NULL, 'k' }, + { "merged", no_argument, &opt::onlyMerged, 1 }, + { "out", required_argument, NULL, 'o' }, + { "path", required_argument, NULL, 'p' }, + { "verbose", no_argument, NULL, 'v' }, + { "help", no_argument, NULL, OPT_HELP }, + { "version", no_argument, NULL, OPT_VERSION }, + { "db", required_argument, NULL, OPT_DB }, + { "library", required_argument, NULL, OPT_LIBRARY }, + { "strain", required_argument, NULL, OPT_STRAIN }, + { "species", required_argument, NULL, OPT_SPECIES }, + { NULL, 0, NULL, 0 } }; /* A contig sequence. */ -struct Contig { +struct Contig +{ Contig(const string& comment, const string& seq) - : comment(comment), seq(seq) { } - Contig(const FastaRecord& o) : comment(o.comment), seq(o.seq) { } + : comment(comment) + , seq(seq) + {} + Contig(const FastaRecord& o) + : comment(o.comment) + , seq(o.seq) + {} string comment; string seq; }; @@ -144,7 +156,8 @@ /** Return the sequence of the specified contig node. The sequence * may be ambiguous or reverse complemented. */ -static Sequence sequence(const Contigs& contigs, const ContigNode& id) +static Sequence +sequence(const Contigs& contigs, const ContigNode& id) { if (id.ambiguous()) { string s(id.ambiguousSequence()); @@ -160,22 +173,22 @@ /** Return a consensus sequence of a and b. * @return an empty string if a consensus could not be found */ -static string createConsensus(const Sequence& a, const Sequence& b) +static string +createConsensus(const Sequence& a, const Sequence& b) { assert(a.length() == b.length()); if (a == b) return a; string s; s.reserve(a.length()); - for (string::const_iterator ita = a.begin(), itb = b.begin(); - ita != a.end(); ++ita, ++itb) { + for (string::const_iterator ita = a.begin(), itb = b.begin(); ita != a.end(); ++ita, ++itb) { bool mask = islower(*ita) || islower(*itb); char ca = toupper(*ita), cb = toupper(*itb); - char c = ca == cb ? ca - : ca == 'N' ? cb - : cb == 'N' ? ca - : ambiguityIsSubset(ca, cb) ? ambiguityOr(ca, cb) - : 'x'; + char c = ca == cb + ? ca + : ca == 'N' + ? cb + : cb == 'N' ? ca : ambiguityIsSubset(ca, cb) ? ambiguityOr(ca, cb) : 'x'; if (c == 'x') return string(""); s += mask ? tolower(c) : c; @@ -183,25 +196,28 @@ return s; } -typedef ContigGraph > Graph; +typedef ContigGraph> Graph; typedef graph_traits::vertex_descriptor vertex_descriptor; /** Return the properties of the specified vertex, unless u is * ambiguous, in which case return the length of the ambiguous * sequence. */ -static inline -ContigProperties get(vertex_bundle_t, const Graph& g, ContigNode u) +static inline ContigProperties +get(vertex_bundle_t, const Graph& g, ContigNode u) { - return u.ambiguous() - ? ContigProperties(u.length() + opt::k - 1, 0) - : g[u]; + return u.ambiguous() ? ContigProperties(u.length() + opt::k - 1, 0) : g[u]; } /** Append the sequence of contig v to seq. */ -static void mergeContigs(const Graph& g, const Contigs& contigs, - vertex_descriptor u, vertex_descriptor v, - Sequence& seq, const ContigPath& path) +static void +mergeContigs( + const Graph& g, + const Contigs& contigs, + vertex_descriptor u, + vertex_descriptor v, + Sequence& seq, + const ContigPath& path) { int d = get(edge_bundle, g, u, v).distance; assert(d < 0); @@ -235,14 +251,13 @@ unsigned matches = o.overlap_match; const string& consensus = o.overlap_str; float identity = (float)matches / consensus.size(); - good = matches >= opt::minOverlap - && identity >= opt::minIdentity; + good = matches >= opt::minOverlap && identity >= opt::minIdentity; if (opt::verbose > 2) - cerr << matches << " / " << consensus.size() - << " = " << identity - << (matches < opt::minOverlap ? " (too few)" - : identity < opt::minIdentity ? " (too low)" - : " (good)") << '\n'; + cerr << matches << " / " << consensus.size() << " = " << identity + << (matches < opt::minOverlap + ? " (too few)" + : identity < opt::minIdentity ? " (too low)" : " (good)") + << '\n'; } if (good) { assert(overlaps.size() == 1); @@ -252,16 +267,18 @@ seq += Sequence(s, o.overlap_h_pos + 1); } else { cerr << "warning: the head of " << get(vertex_name, g, v) - << " does not match the tail of the previous contig\n" - << ao << '\n' << bo << '\n' << path << endl; + << " does not match the tail of the previous contig\n" + << ao << '\n' + << bo << '\n' + << path << endl; seq += 'n'; seq += s; } } /** Return a FASTA comment for the specified path. */ -static void pathToComment(ostream& out, - const Graph& g, const ContigPath& path) +static void +pathToComment(ostream& out, const Graph& g, const ContigPath& path) { out << get(vertex_name, g, path.front()); if (path.size() == 1) @@ -274,20 +291,19 @@ } /** Merge the specified path. */ -static Contig mergePath(const Graph& g, const Contigs& contigs, - const ContigPath& path) +static Contig +mergePath(const Graph& g, const Contigs& contigs, const ContigPath& path) { Sequence seq; unsigned coverage = 0; - for (ContigPath::const_iterator it = path.begin(); - it != path.end(); ++it) { + for (ContigPath::const_iterator it = path.begin(); it != path.end(); ++it) { if (!it->ambiguous()) coverage += g[*it].coverage; if (seq.empty()) { seq = sequence(contigs, *it); } else { assert(it != path.begin()); - mergeContigs(g, contigs, *(it-1), *it, seq, path); + mergeContigs(g, contigs, *(it - 1), *it, seq, path); } } ostringstream ss; @@ -302,8 +318,8 @@ /** Read contig paths from the specified file. * @param ids [out] the string ID of the paths */ -static ContigPaths readPaths(const string& inPath, - vector* ids = NULL) +static ContigPaths +readPaths(const string& inPath, vector* ids = NULL) { if (ids != NULL) assert(ids->empty()); @@ -325,13 +341,16 @@ ++count; if (opt::verbose > 1 && count % 1000000 == 0) - cerr << "Read " << count << " paths. " - "Using " << toSI(getMemoryUsage()) - << "B of memory.\n"; + cerr << "Read " << count + << " paths. " + "Using " + << toSI(getMemoryUsage()) << "B of memory.\n"; } if (opt::verbose > 0) - cerr << "Read " << count << " paths. " - "Using " << toSI(getMemoryUsage()) << "B of memory.\n"; + cerr << "Read " << count + << " paths. " + "Using " + << toSI(getMemoryUsage()) << "B of memory.\n"; if (!opt::db.empty()) addToDb(db, "Init_paths", count); opt::pathCount = count; @@ -341,12 +360,11 @@ /** Finds all contigs used in each path in paths, and * marks them as seen in the vector seen. */ -static void seenContigs(vector& seen, const ContigPaths& paths) +static void +seenContigs(vector& seen, const ContigPaths& paths) { - for (ContigPaths::const_iterator it = paths.begin(); - it != paths.end(); ++it) - for (ContigPath::const_iterator itc = it->begin(); - itc != it->end(); ++itc) + for (ContigPaths::const_iterator it = paths.begin(); it != paths.end(); ++it) + for (ContigPath::const_iterator itc = it->begin(); itc != it->end(); ++itc) if (itc->id() < seen.size()) seen[itc->id()] = true; } @@ -354,14 +372,12 @@ /** Mark contigs for removal. An empty path indicates that a contig * should be removed. */ -static void markRemovedContigs(vector& marked, - const vector& pathIDs, const ContigPaths& paths) +static void +markRemovedContigs(vector& marked, const vector& pathIDs, const ContigPaths& paths) { - for (ContigPaths::const_iterator it = paths.begin(); - it != paths.end(); ++it) { + for (ContigPaths::const_iterator it = paths.begin(); it != paths.end(); ++it) { if (it->empty()) { - size_t i = get(g_contigNames, - pathIDs[it - paths.begin()]); + size_t i = get(g_contigNames, pathIDs[it - paths.begin()]); assert(i < marked.size()); marked[i] = true; } @@ -369,16 +385,18 @@ } /** Output the updated overlap graph. */ -static void outputGraph(Graph& g, - const vector& pathIDs, const ContigPaths& paths, - const string& commandLine) +static void +outputGraph( + Graph& g, + const vector& pathIDs, + const ContigPaths& paths, + const string& commandLine) { typedef graph_traits::vertex_descriptor V; // Add the path vertices. g_contigNames.unlock(); - for (ContigPaths::const_iterator it = paths.begin(); - it != paths.end(); ++it) { + for (ContigPaths::const_iterator it = paths.begin(); it != paths.end(); ++it) { const ContigPath& path = *it; const string& id = pathIDs[it - paths.begin()]; if (!path.empty()) { @@ -389,15 +407,14 @@ g_contigNames.lock(); // Remove the vertices that are used in paths. - for (ContigPaths::const_iterator it = paths.begin(); - it != paths.end(); ++it) { + for (ContigPaths::const_iterator it = paths.begin(); it != paths.end(); ++it) { const ContigPath& path = *it; const string& id = pathIDs[it - paths.begin()]; if (path.empty()) { remove_vertex(find_vertex(id, false, g), g); } else { - remove_vertex_if(g, path.begin(), path.end(), - not1(std::mem_fun_ref(&ContigNode::ambiguous))); + remove_vertex_if( + g, path.begin(), path.end(), [](const ContigNode& c) { return !c.ambiguous(); }); } } @@ -414,7 +431,8 @@ printGraphStats(cerr, g); } -int main(int argc, char** argv) +int +main(int argc, char** argv) { opt::trimMasked = false; @@ -422,7 +440,7 @@ { ostringstream ss; char** last = argv + argc - 1; - copy(argv, last, ostream_iterator(ss, " ")); + copy(argv, last, ostream_iterator(ss, " ")); ss << *last; commandLine = ss.str(); } @@ -431,33 +449,45 @@ opt::metaVars.resize(3); bool die = false; - for (int c; (c = getopt_long(argc, argv, - shortopts, longopts, NULL)) != -1;) { + for (int c; (c = getopt_long(argc, argv, shortopts, longopts, NULL)) != -1;) { istringstream arg(optarg != NULL ? optarg : ""); switch (c) { - case '?': die = true; break; - case 'g': arg >> opt::graphPath; break; - case 'k': arg >> opt::k; break; - case 'o': arg >> opt::out; break; - case 'v': opt::verbose++; break; - case OPT_HELP: - cout << USAGE_MESSAGE; - exit(EXIT_SUCCESS); - case OPT_VERSION: - cout << VERSION_MESSAGE; - exit(EXIT_SUCCESS); - case OPT_DB: - arg >> opt::db; break; - case OPT_LIBRARY: - arg >> opt::metaVars[0]; break; - case OPT_STRAIN: - arg >> opt::metaVars[1]; break; - case OPT_SPECIES: - arg >> opt::metaVars[2]; break; + case '?': + die = true; + break; + case 'g': + arg >> opt::graphPath; + break; + case 'k': + arg >> opt::k; + break; + case 'o': + arg >> opt::out; + break; + case 'v': + opt::verbose++; + break; + case OPT_HELP: + cout << USAGE_MESSAGE; + exit(EXIT_SUCCESS); + case OPT_VERSION: + cout << VERSION_MESSAGE; + exit(EXIT_SUCCESS); + case OPT_DB: + arg >> opt::db; + break; + case OPT_LIBRARY: + arg >> opt::metaVars[0]; + break; + case OPT_STRAIN: + arg >> opt::metaVars[1]; + break; + case OPT_SPECIES: + arg >> opt::metaVars[2]; + break; } if (optarg != NULL && !arg.eof()) { - cerr << PROGRAM ": invalid option: `-" - << (char)c << optarg << "'\n"; + cerr << PROGRAM ": invalid option: `-" << (char)c << optarg << "'\n"; exit(EXIT_FAILURE); } } @@ -468,7 +498,8 @@ } if (opt::out.empty()) { - cerr << PROGRAM ": " << "missing -o,--out option\n"; + cerr << PROGRAM ": " + << "missing -o,--out option\n"; die = true; } @@ -483,19 +514,12 @@ } if (die) { - cerr << "Try `" << PROGRAM - << " --help' for more information.\n"; + cerr << "Try `" << PROGRAM << " --help' for more information.\n"; exit(EXIT_FAILURE); } if (!opt::db.empty()) { - init(db, - opt::db, - opt::verbose, - PROGRAM, - opt::getCommand(argc, argv), - opt::metaVars - ); + init(db, opt::db, opt::verbose, PROGRAM, opt::getCommand(argc, argv), opt::metaVars); addToDb(db, "K", opt::k); } @@ -513,9 +537,10 @@ fin >> g; assert(fin.eof()); if (opt::verbose > 0) - cerr << "Read " << num_vertices(g) << " vertices. " - "Using " << toSI(getMemoryUsage()) - << "B of memory.\n"; + cerr << "Read " << num_vertices(g) + << " vertices. " + "Using " + << toSI(getMemoryUsage()) << "B of memory.\n"; if (!opt::db.empty()) { addToDb(db, "Init_vertices", num_vertices(g)); addToDb(db, "Init_edges", num_edges(g)); @@ -531,12 +556,11 @@ unsigned count = 0; FastaReader in(contigFile, FastaReader::NO_FOLD_CASE); for (FastaRecord rec; in >> rec;) { - if (!adjPath.empty() - && g_contigNames.count(rec.id) == 0) + if (!adjPath.empty() && g_contigNames.count(rec.id) == 0) continue; if (adjPath.empty()) { - graph_traits::vertex_descriptor - u = add_vertex(ContigProperties(rec.seq.length(), 0), g); + graph_traits::vertex_descriptor u = + add_vertex(ContigProperties(rec.seq.length(), 0), g); put(vertex_name, g, u, rec.id); } assert(get(g_contigNames, rec.id) == contigs.size()); @@ -544,14 +568,16 @@ ++count; if (opt::verbose > 1 && count % 1000000 == 0) - cerr << "Read " << count << " sequences. " - "Using " << toSI(getMemoryUsage()) - << "B of memory.\n"; + cerr << "Read " << count + << " sequences. " + "Using " + << toSI(getMemoryUsage()) << "B of memory.\n"; } if (opt::verbose > 0) - cerr << "Read " << count << " sequences. " - "Using " << toSI(getMemoryUsage()) - << "B of memory.\n"; + cerr << "Read " << count + << " sequences. " + "Using " + << toSI(getMemoryUsage()) << "B of memory.\n"; if (!opt::db.empty()) addToDb(db, "Init_seq", count); assert(in.eof()); @@ -571,12 +597,10 @@ // Output those contigs that were not seen in a path. Histogram lengthHistogram; ofstream fout; - ostream& out = opt::out == "-" ? cout - : (fout.open(opt::out.c_str()), fout); + ostream& out = opt::out == "-" ? cout : (fout.open(opt::out.c_str()), fout); assert_good(out, opt::out); if (!opt::onlyMerged) { - for (Contigs::const_iterator it = contigs.begin(); - it != contigs.end(); ++it) { + for (Contigs::const_iterator it = contigs.begin(); it != contigs.end(); ++it) { ContigID id(it - contigs.begin()); if (!seen[id]) { const Contig& contig = *it; @@ -585,29 +609,23 @@ out << ' ' << contig.comment; out << '\n' << contig.seq << '\n'; if (opt::verbose > 0) - lengthHistogram.insert( - count_if(contig.seq.begin(), contig.seq.end(), - isACGT)); + lengthHistogram.insert(count_if(contig.seq.begin(), contig.seq.end(), isACGT)); } } } unsigned npaths = 0; - for (ContigPaths::const_iterator it = paths.begin(); - it != paths.end(); ++it) { + for (ContigPaths::const_iterator it = paths.begin(); it != paths.end(); ++it) { const ContigPath& path = *it; if (path.empty()) continue; Contig contig = mergePath(g, contigs, path); - out << '>' << pathIDs[it - paths.begin()] - << ' ' << contig.comment << '\n' - << contig.seq << '\n'; + out << '>' << pathIDs[it - paths.begin()] << ' ' << contig.comment << '\n' + << contig.seq << '\n'; assert_good(out, opt::out); npaths++; if (opt::verbose > 0) - lengthHistogram.insert( - count_if(contig.seq.begin(), contig.seq.end(), - isACGT)); + lengthHistogram.insert(count_if(contig.seq.begin(), contig.seq.end(), isACGT)); } if (!opt::graphPath.empty()) @@ -617,7 +635,7 @@ return 0; float minCov = numeric_limits::infinity(), - minCovUsed = numeric_limits::infinity(); + minCovUsed = numeric_limits::infinity(); for (unsigned i = 0; i < contigs.size(); i++) { ContigProperties vp = g[ContigNode(i, false)]; if (vp.coverage == 0 || vp.length < opt::k) @@ -628,18 +646,16 @@ minCovUsed = min(minCovUsed, cov); } - cerr << "The minimum coverage of single-end contigs is " - << minCov << ".\n" - << "The minimum coverage of merged contigs is " - << minCovUsed << ".\n"; + cerr << "The minimum coverage of single-end contigs is " << minCov << ".\n" + << "The minimum coverage of merged contigs is " << minCovUsed << ".\n"; if (minCov < minCovUsed) cerr << "Consider increasing the coverage threshold " - "parameter, c, to " << minCovUsed << ".\n"; + "parameter, c, to " + << minCovUsed << ".\n"; if (opt::verbose > 0) { const unsigned STATS_MIN_LENGTH = 200; // bp - printContiguityStats(cerr, lengthHistogram, STATS_MIN_LENGTH) - << '\t' << opt::out << '\n'; + printContiguityStats(cerr, lengthHistogram, STATS_MIN_LENGTH) << '\t' << opt::out << '\n'; } return 0; } diff -Nru abyss-2.2.3/MergePaths/MergePaths.cpp abyss-2.2.4/MergePaths/MergePaths.cpp --- abyss-2.2.3/MergePaths/MergePaths.cpp 2019-09-27 20:02:10.000000000 +0000 +++ abyss-2.2.4/MergePaths/MergePaths.cpp 2020-01-30 18:48:44.000000000 +0000 @@ -1,18 +1,18 @@ -#include "config.h" #include "Common/Options.h" #include "ContigID.h" #include "ContigPath.h" #include "Functional.h" // for mem_var -#include "IOUtil.h" -#include "Uncompress.h" #include "Graph/Assemble.h" #include "Graph/ContigGraph.h" #include "Graph/DirectedGraph.h" #include "Graph/DotIO.h" #include "Graph/GraphAlgorithms.h" #include "Graph/GraphUtil.h" -#include +#include "IOUtil.h" +#include "Uncompress.h" +#include "config.h" #include +#include #include #include // for UINT_MAX #include @@ -27,10 +27,10 @@ #include #include #if _OPENMP -# include +#include #endif -#include "DataBase/Options.h" #include "DataBase/DB.h" +#include "DataBase/Options.h" using namespace std; using boost::tie; @@ -40,89 +40,96 @@ DB db; static const char VERSION_MESSAGE[] = -PROGRAM " (" PACKAGE_NAME ") " VERSION "\n" -"Written by Jared Simpson and Shaun Jackman.\n" -"\n" -"Copyright 2014 Canada's Michael Smith Genome Sciences Centre\n"; + PROGRAM " (" PACKAGE_NAME ") " VERSION "\n" + "Written by Jared Simpson and Shaun Jackman.\n" + "\n" + "Copyright 2014 Canada's Michael Smith Genome Sciences Centre\n"; static const char USAGE_MESSAGE[] = -"Usage: " PROGRAM " -k [OPTION]... LEN PATH\n" -"Merge sequences of contigs IDs.\n" -"\n" -" Arguments:\n" -"\n" -" LEN lengths of the contigs\n" -" PATH sequences of contig IDs\n" -"\n" -" Options:\n" -"\n" -" -k, --kmer=KMER_SIZE k-mer size\n" -" -s, --seed-length=L minimum length of a seed contig [0]\n" -" -G, --genome-size=N expected genome size. Used to calculate NG50\n" -" and associated stats [disabled]\n" -" -o, --out=FILE write result to FILE\n" -" --no-greedy use the non-greedy algorithm [default]\n" -" --greedy use the greedy algorithm\n" -" -g, --graph=FILE write the path overlap graph to FILE\n" -" -j, --threads=N use N parallel threads [1]\n" -" -v, --verbose display verbose output\n" -" --help display this help and exit\n" -" --version output version information and exit\n" -" --db=FILE specify path of database repository in FILE\n" -" --library=NAME specify library NAME for database\n" -" --strain=NAME specify strain NAME for database\n" -" --species=NAME specify species NAME for database\n" -"\n" -"Report bugs to <" PACKAGE_BUGREPORT ">.\n"; + "Usage: " PROGRAM " -k [OPTION]... LEN PATH\n" + "Merge sequences of contigs IDs.\n" + "\n" + " Arguments:\n" + "\n" + " LEN lengths of the contigs\n" + " PATH sequences of contig IDs\n" + "\n" + " Options:\n" + "\n" + " -k, --kmer=KMER_SIZE k-mer size\n" + " -s, --seed-length=L minimum length of a seed contig [0]\n" + " -G, --genome-size=N expected genome size. Used to calculate NG50\n" + " and associated stats [disabled]\n" + " -o, --out=FILE write result to FILE\n" + " --no-greedy use the non-greedy algorithm [default]\n" + " --greedy use the greedy algorithm\n" + " -g, --graph=FILE write the path overlap graph to FILE\n" + " -j, --threads=N use N parallel threads [1]\n" + " -v, --verbose display verbose output\n" + " --help display this help and exit\n" + " --version output version information and exit\n" + " --db=FILE specify path of database repository in FILE\n" + " --library=NAME specify library NAME for database\n" + " --strain=NAME specify strain NAME for database\n" + " --species=NAME specify species NAME for database\n" + "\n" + "Report bugs to <" PACKAGE_BUGREPORT ">.\n"; namespace opt { - string db; - dbVars metaVars; - unsigned k; // used by GraphIO - static string out; - static int threads = 1; - - /** Minimum length of a seed contig. */ - static unsigned seedLen; +string db; +dbVars metaVars; +unsigned k; // used by GraphIO +static string out; +static int threads = 1; + +/** Minimum length of a seed contig. */ +static unsigned seedLen; - /** Use a greedy algorithm. */ - static int greedy; +/** Use a greedy algorithm. */ +static int greedy; - /** Genome size. Used to calculate NG50. */ - static long long unsigned genomeSize; +/** Genome size. Used to calculate NG50. */ +static long long unsigned genomeSize; - /** Write the path overlap graph to this file. */ - static string graphPath; +/** Write the path overlap graph to this file. */ +static string graphPath; } static const char shortopts[] = "G:g:j:k:o:s:v"; -enum { OPT_HELP = 1, OPT_VERSION, OPT_DB, OPT_LIBRARY, OPT_STRAIN, OPT_SPECIES }; -//enum { OPT_HELP = 1, OPT_VERSION }; - -static const struct option longopts[] = { - { "genome-size", required_argument, NULL, 'G' }, - { "graph", no_argument, NULL, 'g' }, - { "greedy", no_argument, &opt::greedy, true }, - { "no-greedy", no_argument, &opt::greedy, false }, - { "kmer", required_argument, NULL, 'k' }, - { "out", required_argument, NULL, 'o' }, - { "seed-length", required_argument, NULL, 's' }, - { "threads", required_argument, NULL, 'j' }, - { "verbose", no_argument, NULL, 'v' }, - { "help", no_argument, NULL, OPT_HELP }, - { "version", no_argument, NULL, OPT_VERSION }, - { "db", required_argument, NULL, OPT_DB }, - { "library", required_argument, NULL, OPT_LIBRARY }, - { "strain", required_argument, NULL, OPT_STRAIN }, - { "species", required_argument, NULL, OPT_SPECIES }, - { NULL, 0, NULL, 0 } +enum +{ + OPT_HELP = 1, + OPT_VERSION, + OPT_DB, + OPT_LIBRARY, + OPT_STRAIN, + OPT_SPECIES }; +// enum { OPT_HELP = 1, OPT_VERSION }; + +static const struct option longopts[] = { { "genome-size", required_argument, NULL, 'G' }, + { "graph", no_argument, NULL, 'g' }, + { "greedy", no_argument, &opt::greedy, true }, + { "no-greedy", no_argument, &opt::greedy, false }, + { "kmer", required_argument, NULL, 'k' }, + { "out", required_argument, NULL, 'o' }, + { "seed-length", required_argument, NULL, 's' }, + { "threads", required_argument, NULL, 'j' }, + { "verbose", no_argument, NULL, 'v' }, + { "help", no_argument, NULL, OPT_HELP }, + { "version", no_argument, NULL, OPT_VERSION }, + { "db", required_argument, NULL, OPT_DB }, + { "library", required_argument, NULL, OPT_LIBRARY }, + { "strain", required_argument, NULL, OPT_STRAIN }, + { "species", required_argument, NULL, OPT_SPECIES }, + { NULL, 0, NULL, 0 } }; typedef map ContigPathMap; /** Orientation of an edge. */ -enum dir_type { +enum dir_type +{ DIR_X, // u--v none DIR_F, // u->v forward DIR_R, // u<-v reverse @@ -132,19 +139,22 @@ /** Lengths of contigs measured in k-mer. */ typedef vector Lengths; -static ContigPath align(const Lengths& lengths, - const ContigPath& p1, const ContigPath& p2, - ContigNode pivot); -static ContigPath align(const Lengths& lengths, - const ContigPath& p1, const ContigPath& p2, - ContigNode pivot, dir_type& orientation); +static ContigPath +align(const Lengths& lengths, const ContigPath& p1, const ContigPath& p2, ContigNode pivot); +static ContigPath +align( + const Lengths& lengths, + const ContigPath& p1, + const ContigPath& p2, + ContigNode pivot, + dir_type& orientation); static bool gDebugPrint; /** - * Build a histogram of the lengths of the assembled paths and unused contigs. - * Note: This function does not account for the ammount of overlap between contigs. - */ + * Build a histogram of the lengths of the assembled paths and unused contigs. + * Note: This function does not account for the ammount of overlap between contigs. + */ static Histogram buildAssembledLengthHistogram(const Lengths& lengths, const ContigPaths& paths) { @@ -153,8 +163,7 @@ // Compute the lengths of the paths // Mark the vertices that are used in paths vector used(lengths.size()); - for (ContigPaths::const_iterator pathIt = paths.begin(); - pathIt != paths.end(); ++pathIt) { + for (ContigPaths::const_iterator pathIt = paths.begin(); pathIt != paths.end(); ++pathIt) { const ContigPath& path = *pathIt; size_t totalLength = 0; for (ContigPath::const_iterator it = path.begin(); it != path.end(); ++it) { @@ -184,25 +193,23 @@ Histogram h = buildAssembledLengthHistogram(lengths, paths); const unsigned STATS_MIN_LENGTH = opt::seedLen; printContiguityStats(cerr, h, STATS_MIN_LENGTH, true, "\t", opt::genomeSize) - << '\t' << opt::out << '\n'; + << '\t' << opt::out << '\n'; } /** Return all contigs that are tandem repeats, identified as those * contigs that appear more than once in a single path. */ -static set findRepeats(const ContigPathMap& paths) +static set +findRepeats(const ContigPathMap& paths) { set repeats; - for (ContigPathMap::const_iterator pathIt = paths.begin(); - pathIt != paths.end(); ++pathIt) { + for (ContigPathMap::const_iterator pathIt = paths.begin(); pathIt != paths.end(); ++pathIt) { const ContigPath& path = pathIt->second; map count; - for (ContigPath::const_iterator it = path.begin(); - it != path.end(); ++it) + for (ContigPath::const_iterator it = path.begin(); it != path.end(); ++it) if (!it->ambiguous()) count[it->contigIndex()]++; - for (map::const_iterator - it = count.begin(); it != count.end(); ++it) + for (map::const_iterator it = count.begin(); it != count.end(); ++it) if (it->second > 1) repeats.insert(it->first); } @@ -212,14 +219,14 @@ /** Remove tandem repeats from the set of paths. * @return the removed paths */ -static set removeRepeats(ContigPathMap& paths) +static set +removeRepeats(ContigPathMap& paths) { set repeats = findRepeats(paths); if (gDebugPrint) { cout << "Repeats:"; if (!repeats.empty()) { - for (set::const_iterator it = repeats.begin(); - it != repeats.end(); ++it) + for (set::const_iterator it = repeats.begin(); it != repeats.end(); ++it) cout << ' ' << get(g_contigNames, *it); } else cout << " none"; @@ -227,8 +234,7 @@ } unsigned removed = 0; - for (set::const_iterator it = repeats.begin(); - it != repeats.end(); ++it) + for (set::const_iterator it = repeats.begin(); it != repeats.end(); ++it) if (paths.count(*it) > 0) removed++; if (removed == paths.size()) { @@ -239,8 +245,7 @@ } ostringstream ss; - for (set::const_iterator it = repeats.begin(); - it != repeats.end(); ++it) + for (set::const_iterator it = repeats.begin(); it != repeats.end(); ++it) if (paths.erase(*it) > 0) ss << ' ' << get(g_contigNames, *it); @@ -249,42 +254,46 @@ return repeats; } -static void appendToMergeQ(deque& mergeQ, - set& seen, const ContigPath& path) +static void +appendToMergeQ(deque& mergeQ, set& seen, const ContigPath& path) { - for (ContigPath::const_iterator it = path.begin(); - it != path.end(); ++it) + for (ContigPath::const_iterator it = path.begin(); it != path.end(); ++it) if (!it->ambiguous() && seen.insert(*it).second) mergeQ.push_back(*it); } /** A path overlap graph. */ -typedef ContigGraph > PathGraph; +typedef ContigGraph> PathGraph; /** Add an edge if the two paths overlap. * @param pivot the pivot at which to seed the alignment * @return whether an overlap was found */ -static bool addOverlapEdge(const Lengths& lengths, - PathGraph& gout, ContigNode pivot, - ContigNode seed1, const ContigPath& path1, - ContigNode seed2, const ContigPath& path2) +static bool +addOverlapEdge( + const Lengths& lengths, + PathGraph& gout, + ContigNode pivot, + ContigNode seed1, + const ContigPath& path1, + ContigNode seed2, + const ContigPath& path2) { assert(seed1 != seed2); // Determine the orientation of the overlap edge. dir_type orientation = DIR_X; - ContigPath consensus = align(lengths, - path1, path2, pivot, orientation); + ContigPath consensus = align(lengths, path1, path2, pivot, orientation); if (consensus.empty()) return false; assert(orientation != DIR_X); if (orientation == DIR_B) { // One of the paths subsumes the other. Use the order of the // seeds to determine the orientation of the edge. - orientation = find(consensus.begin(), consensus.end(), seed1) - < find(consensus.begin(), consensus.end(), seed2) - ? DIR_F : DIR_R; + orientation = find(consensus.begin(), consensus.end(), seed1) < + find(consensus.begin(), consensus.end(), seed2) + ? DIR_F + : DIR_R; } assert(orientation == DIR_F || orientation == DIR_R); @@ -301,7 +310,8 @@ } /** Return the specified path. */ -static ContigPath getPath(const ContigPathMap& paths, ContigNode u) +static ContigPath +getPath(const ContigPathMap& paths, ContigNode u) { ContigPathMap::const_iterator it = paths.find(u.contigIndex()); assert(it != paths.end()); @@ -312,45 +322,47 @@ } /** Find the overlaps between paths and add edges to the graph. */ -static void findPathOverlaps(const Lengths& lengths, - const ContigPathMap& paths, - const ContigNode& seed1, const ContigPath& path1, - PathGraph& gout) +static void +findPathOverlaps( + const Lengths& lengths, + const ContigPathMap& paths, + const ContigNode& seed1, + const ContigPath& path1, + PathGraph& gout) { - for (ContigPath::const_iterator it = path1.begin(); - it != path1.end(); ++it) { + for (ContigPath::const_iterator it = path1.begin(); it != path1.end(); ++it) { ContigNode seed2 = *it; if (seed1 == seed2) continue; if (seed2.ambiguous()) continue; - ContigPathMap::const_iterator path2It - = paths.find(seed2.contigIndex()); + ContigPathMap::const_iterator path2It = paths.find(seed2.contigIndex()); if (path2It == paths.end()) continue; ContigPath path2 = path2It->second; if (seed2.sense()) reverseComplement(path2.begin(), path2.end()); - addOverlapEdge(lengths, - gout, seed2, seed1, path1, seed2, path2); + addOverlapEdge(lengths, gout, seed2, seed1, path1, seed2, path2); } } /** Attempt to merge the paths specified in mergeQ with path. * @return the number of paths merged */ -static unsigned mergePaths(const Lengths& lengths, - ContigPath& path, - deque& mergeQ, set& seen, - const ContigPathMap& paths) +static unsigned +mergePaths( + const Lengths& lengths, + ContigPath& path, + deque& mergeQ, + set& seen, + const ContigPathMap& paths) { unsigned merged = 0; deque invalid; for (ContigNode pivot; !mergeQ.empty(); mergeQ.pop_front()) { pivot = mergeQ.front(); - ContigPathMap::const_iterator path2It - = paths.find(pivot.contigIndex()); + ContigPathMap::const_iterator path2It = paths.find(pivot.contigIndex()); if (path2It == paths.end()) continue; @@ -367,9 +379,7 @@ path.swap(consensus); if (gDebugPrint) #pragma omp critical(cout) - cout << get(g_contigNames, pivot) - << '\t' << path2 << '\n' - << '\t' << path << '\n'; + cout << get(g_contigNames, pivot) << '\t' << path2 << '\n' << '\t' << path << '\n'; merged++; } mergeQ.swap(invalid); @@ -379,13 +389,12 @@ /** Merge the paths of the specified seed path. * @return the merged contig path */ -static ContigPath mergePath(const Lengths& lengths, - const ContigPathMap& paths, const ContigPath& seedPath) +static ContigPath +mergePath(const Lengths& lengths, const ContigPathMap& paths, const ContigPath& seedPath) { assert(!seedPath.empty()); ContigNode seed1 = seedPath.front(); - ContigPathMap::const_iterator path1It - = paths.find(seed1.contigIndex()); + ContigPathMap::const_iterator path1It = paths.find(seed1.contigIndex()); assert(path1It != paths.end()); ContigPath path(path1It->second); if (seedPath.front().sense()) @@ -393,36 +402,27 @@ if (opt::verbose > 1) #pragma omp critical(cout) cout << "\n* " << seedPath << '\n' - << get(g_contigNames, seedPath.front()) - << '\t' << path << '\n'; - for (ContigPath::const_iterator it = seedPath.begin() + 1; - it != seedPath.end(); ++it) { + << get(g_contigNames, seedPath.front()) << '\t' << path << '\n'; + for (ContigPath::const_iterator it = seedPath.begin() + 1; it != seedPath.end(); ++it) { ContigNode seed2 = *it; - ContigPathMap::const_iterator path2It - = paths.find(seed2.contigIndex()); + ContigPathMap::const_iterator path2It = paths.find(seed2.contigIndex()); assert(path2It != paths.end()); ContigPath path2 = path2It->second; if (seed2.sense()) reverseComplement(path2.begin(), path2.end()); - ContigNode pivot - = find(path.begin(), path.end(), seed2) != path.end() - ? seed2 : seed1; + ContigNode pivot = find(path.begin(), path.end(), seed2) != path.end() ? seed2 : seed1; ContigPath consensus = align(lengths, path, path2, pivot); if (consensus.empty()) { // This seed could be removed from the seed path. if (opt::verbose > 1) #pragma omp critical(cout) - cout << get(g_contigNames, seed2) - << '\t' << path2 << '\n' - << "\tinvalid\n"; + cout << get(g_contigNames, seed2) << '\t' << path2 << '\n' << "\tinvalid\n"; } else { path.swap(consensus); if (opt::verbose > 1) #pragma omp critical(cout) - cout << get(g_contigNames, seed2) - << '\t' << path2 << '\n' - << '\t' << path << '\n'; + cout << get(g_contigNames, seed2) << '\t' << path2 << '\n' << '\t' << path << '\n'; } seed1 = seed2; } @@ -435,16 +435,15 @@ /** Merge the specified seed paths. * @return the merged contig paths */ -static ContigPaths mergeSeedPaths(const Lengths& lengths, - const ContigPathMap& paths, const ContigPaths& seedPaths) +static ContigPaths +mergeSeedPaths(const Lengths& lengths, const ContigPathMap& paths, const ContigPaths& seedPaths) { if (opt::verbose > 0) cout << "\nMerging paths\n"; ContigPaths out; out.reserve(seedPaths.size()); - for (ContigPaths::const_iterator it = seedPaths.begin(); - it != seedPaths.end(); ++it) + for (ContigPaths::const_iterator it = seedPaths.begin(); it != seedPaths.end(); ++it) out.push_back(mergePath(lengths, paths, *it)); return out; } @@ -452,23 +451,21 @@ /** Extend the specified path as long as is unambiguously possible and * add the result to the specified container. */ -static void extendPaths(const Lengths& lengths, - ContigID id, const ContigPathMap& paths, - ContigPathMap& out) +static void +extendPaths(const Lengths& lengths, ContigID id, const ContigPathMap& paths, ContigPathMap& out) { ContigPathMap::const_iterator pathIt = paths.find(id); assert(pathIt != paths.end()); pair inserted; - #pragma omp critical(out) +#pragma omp critical(out) inserted = out.insert(*pathIt); assert(inserted.second); ContigPath& path = inserted.first->second; if (gDebugPrint) - #pragma omp critical(cout) - cout << "\n* " << get(g_contigNames, id) << "+\n" - << '\t' << path << '\n'; +#pragma omp critical(cout) + cout << "\n* " << get(g_contigNames, id) << "+\n" << '\t' << path << '\n'; set seen; seen.insert(ContigNode(id, false)); @@ -478,34 +475,33 @@ ; if (!mergeQ.empty() && gDebugPrint) { - #pragma omp critical(cout) +#pragma omp critical(cout) { cout << "invalid\n"; - for (deque::const_iterator it - = mergeQ.begin(); it != mergeQ.end(); ++it) - cout << get(g_contigNames, *it) << '\t' - << paths.find(it->contigIndex())->second << '\n'; + for (deque::const_iterator it = mergeQ.begin(); it != mergeQ.end(); ++it) + cout << get(g_contigNames, *it) << '\t' << paths.find(it->contigIndex())->second + << '\n'; } } } /** Return true if the contigs are equal or both are ambiguous. */ -static bool equalOrBothAmbiguos(const ContigNode& a, - const ContigNode& b) +static bool +equalOrBothAmbiguos(const ContigNode& a, const ContigNode& b) { return a == b || (a.ambiguous() && b.ambiguous()); } /** Return true if both paths are equal, ignoring ambiguous nodes. */ -static bool equalIgnoreAmbiguos(const ContigPath& a, - const ContigPath& b) +static bool +equalIgnoreAmbiguos(const ContigPath& a, const ContigPath& b) { - return a.size() == b.size() - && equal(a.begin(), a.end(), b.begin(), equalOrBothAmbiguos); + return a.size() == b.size() && equal(a.begin(), a.end(), b.begin(), equalOrBothAmbiguos); } /** Return whether this path is a cycle. */ -static bool isCycle(const Lengths& lengths, const ContigPath& path) +static bool +isCycle(const Lengths& lengths, const ContigPath& path) { return !align(lengths, path, path, path.front()).empty(); } @@ -514,27 +510,26 @@ * @param overlaps [out] paths that are found to overlap * @return the ID of the subsuming path */ -static ContigID identifySubsumedPaths(const Lengths& lengths, - ContigPathMap::const_iterator path1It, - ContigPathMap& paths, - set& out, - set& overlaps) +static ContigID +identifySubsumedPaths( + const Lengths& lengths, + ContigPathMap::const_iterator path1It, + ContigPathMap& paths, + set& out, + set& overlaps) { ostringstream vout; out.clear(); ContigID id(path1It->first); const ContigPath& path = path1It->second; if (gDebugPrint) - vout << get(g_contigNames, ContigNode(id, false)) - << '\t' << path << '\n'; + vout << get(g_contigNames, ContigNode(id, false)) << '\t' << path << '\n'; - for (ContigPath::const_iterator it = path.begin(); - it != path.end(); ++it) { + for (ContigPath::const_iterator it = path.begin(); it != path.end(); ++it) { ContigNode pivot = *it; if (pivot.ambiguous() || pivot.id() == id) continue; - ContigPathMap::iterator path2It - = paths.find(pivot.contigIndex()); + ContigPathMap::iterator path2It = paths.find(pivot.contigIndex()); if (path2It == paths.end()) continue; ContigPath path2 = path2It->second; @@ -545,13 +540,11 @@ continue; if (equalIgnoreAmbiguos(consensus, path)) { if (gDebugPrint) - vout << get(g_contigNames, pivot) - << '\t' << path2 << '\n'; + vout << get(g_contigNames, pivot) << '\t' << path2 << '\n'; out.insert(path2It->first); } else if (equalIgnoreAmbiguos(consensus, path2)) { // This path is larger. Use it as the seed. - return identifySubsumedPaths(lengths, path2It, paths, out, - overlaps); + return identifySubsumedPaths(lengths, path2It, paths, out, overlaps); } else if (isCycle(lengths, consensus)) { // The consensus path is a cycle. bool isCyclePath1 = isCycle(lengths, path); @@ -559,17 +552,15 @@ if (!isCyclePath1 && !isCyclePath2) { // Neither path is a cycle. if (gDebugPrint) - vout << get(g_contigNames, pivot) - << '\t' << path2 << '\n' - << "ignored\t" << consensus << '\n'; + vout << get(g_contigNames, pivot) << '\t' << path2 << '\n' + << "ignored\t" << consensus << '\n'; overlaps.insert(id); overlaps.insert(path2It->first); } else { // At least one path is a cycle. if (gDebugPrint) - vout << get(g_contigNames, pivot) - << '\t' << path2 << '\n' - << "cycle\t" << consensus << '\n'; + vout << get(g_contigNames, pivot) << '\t' << path2 << '\n' + << "cycle\t" << consensus << '\n'; if (isCyclePath1 && isCyclePath2) out.insert(path2It->first); else if (!isCyclePath1) @@ -579,9 +570,8 @@ } } else { if (gDebugPrint) - vout << get(g_contigNames, pivot) - << '\t' << path2 << '\n' - << "ignored\t" << consensus << '\n'; + vout << get(g_contigNames, pivot) << '\t' << path2 << '\n' + << "ignored\t" << consensus << '\n'; overlaps.insert(id); overlaps.insert(path2It->first); } @@ -595,19 +585,20 @@ * @param overlaps [out] paths that are found to overlap * @return the next iterator after path1it */ -static ContigPathMap::const_iterator removeSubsumedPaths( - const Lengths& lengths, - ContigPathMap::const_iterator path1It, ContigPathMap& paths, - ContigID& seed, set& overlaps) +static ContigPathMap::const_iterator +removeSubsumedPaths( + const Lengths& lengths, + ContigPathMap::const_iterator path1It, + ContigPathMap& paths, + ContigID& seed, + set& overlaps) { if (gDebugPrint) cout << '\n'; set eq; - seed = identifySubsumedPaths(lengths, - path1It, paths, eq, overlaps); + seed = identifySubsumedPaths(lengths, path1It, paths, eq, overlaps); ++path1It; - for (set::const_iterator it = eq.begin(); - it != eq.end(); ++it) { + for (set::const_iterator it = eq.begin(); it != eq.end(); ++it) { if (*it == path1It->first) ++path1It; paths.erase(*it); @@ -618,16 +609,14 @@ /** Remove paths subsumed by another path. * @return paths that are found to overlap */ -static set removeSubsumedPaths(const Lengths& lengths, - ContigPathMap& paths) +static set +removeSubsumedPaths(const Lengths& lengths, ContigPathMap& paths) { set overlaps, seen; - for (ContigPathMap::const_iterator iter = paths.begin(); - iter != paths.end();) { + for (ContigPathMap::const_iterator iter = paths.begin(); iter != paths.end();) { if (seen.count(iter->first) == 0) { ContigID seed; - iter = removeSubsumedPaths(lengths, - iter, paths, seed, overlaps); + iter = removeSubsumedPaths(lengths, iter, paths, seed, overlaps); seen.insert(seed); } else ++iter; @@ -639,8 +628,8 @@ * outgoing edges, (u,v1) and (u,v2), add the edge (v1,v2) if v1 < v2, * and add the edge (v2,v1) if v2 < v1. */ -static void addMissingEdges(const Lengths& lengths, - PathGraph& g, const ContigPathMap& paths) +static void +addMissingEdges(const Lengths& lengths, PathGraph& g, const ContigPathMap& paths) { typedef graph_traits::adjacency_iterator Vit; typedef graph_traits::vertex_iterator Uit; @@ -658,8 +647,7 @@ ++vit1; assert(v1 != u); ContigPath path1 = getPath(paths, v1); - if (find(path1.begin(), path1.end(), - ContigPath::value_type(u)) == path1.end()) + if (find(path1.begin(), path1.end(), ContigPath::value_type(u)) == path1.end()) continue; for (Vit vit2 = vit1; vit2 != vrange.second; ++vit2) { V v2 = *vit2; @@ -668,11 +656,9 @@ if (edge(v1, v2, g).second || edge(v2, v1, g).second) continue; ContigPath path2 = getPath(paths, v2); - if (find(path2.begin(), path2.end(), - ContigPath::value_type(u)) == path2.end()) + if (find(path2.begin(), path2.end(), ContigPath::value_type(u)) == path2.end()) continue; - numAdded += addOverlapEdge(lengths, - g, u, v1, path1, v2, path2); + numAdded += addOverlapEdge(lengths, g, u, v1, path1, v2, path2); } } } @@ -683,15 +669,15 @@ } /** Remove transitive edges. */ -static void removeTransitiveEdges(PathGraph& pathGraph) +static void +removeTransitiveEdges(PathGraph& pathGraph) { unsigned nbefore = num_edges(pathGraph); unsigned nremoved = remove_transitive_edges(pathGraph); unsigned nafter = num_edges(pathGraph); if (opt::verbose > 0) - cout << "Removed " << nremoved << " transitive edges of " - << nbefore << " edges leaving " - << nafter << " edges.\n"; + cout << "Removed " << nremoved << " transitive edges of " << nbefore << " edges leaving " + << nafter << " edges.\n"; assert(nbefore - nremoved == nafter); if (!opt::db.empty()) { addToDb(db, "Edges_init", nbefore); @@ -703,8 +689,8 @@ * Remove the edge (u,v) if deg+(u) > 1 and deg-(v) > 1 and the * overlap of (u,v) is small. */ -static void removeSmallOverlaps(PathGraph& g, - const ContigPathMap& paths) +static void +removeSmallOverlaps(PathGraph& g, const ContigPathMap& paths) { typedef graph_traits::edge_descriptor E; typedef graph_traits::out_edge_iterator Eit; @@ -726,21 +712,20 @@ if (in_degree(v, g) < 2) continue; ContigPath pathv = getPath(paths, v); - if (pathu.back() == pathv.front() - && paths.count(pathu.back().contigIndex()) > 0) + if (pathu.back() == pathv.front() && paths.count(pathu.back().contigIndex()) > 0) edges.push_back(uv); } } remove_edges(g, edges.begin(), edges.end()); if (opt::verbose > 0) - cout << "Removed " << edges.size() - << " small overlap edges.\n"; + cout << "Removed " << edges.size() << " small overlap edges.\n"; if (!opt::db.empty()) addToDb(db, "Edges_removed_small_overlap", edges.size()); } /** Output the path overlap graph. */ -static void outputPathGraph(PathGraph& pathGraph) +static void +outputPathGraph(PathGraph& pathGraph) { if (opt::graphPath.empty()) return; @@ -751,20 +736,23 @@ } /** Sort and output the specified paths. */ -static void outputSortedPaths(const Lengths& lengths, const ContigPathMap& paths) +static void +outputSortedPaths(const Lengths& lengths, const ContigPathMap& paths) { // Sort the paths. vector sortedPaths(paths.size()); - transform(paths.begin(), paths.end(), sortedPaths.begin(), - mem_var(&ContigPathMap::value_type::second)); + transform( + paths.begin(), + paths.end(), + sortedPaths.begin(), + mem_var(&ContigPathMap::value_type::second)); sort(sortedPaths.begin(), sortedPaths.end()); // Output the paths. ofstream fout(opt::out.c_str()); ostream& out = opt::out.empty() ? cout : fout; assert_good(out, opt::out); - for (vector::const_iterator it = sortedPaths.begin(); - it != sortedPaths.end(); ++it) + for (vector::const_iterator it = sortedPaths.begin(); it != sortedPaths.end(); ++it) out << createContigName() << '\t' << *it << '\n'; assert_good(out, opt::out); @@ -773,28 +761,24 @@ } /** Assemble the path overlap graph. */ -static void assemblePathGraph(const Lengths& lengths, - PathGraph& pathGraph, ContigPathMap& paths) +static void +assemblePathGraph(const Lengths& lengths, PathGraph& pathGraph, ContigPathMap& paths) { ContigPaths seedPaths; assembleDFS(pathGraph, back_inserter(seedPaths)); - ContigPaths mergedPaths = mergeSeedPaths(lengths, - paths, seedPaths); + ContigPaths mergedPaths = mergeSeedPaths(lengths, paths, seedPaths); if (opt::verbose > 1) cout << '\n'; // Replace each path with the merged path. - for (ContigPaths::const_iterator it1 = seedPaths.begin(); - it1 != seedPaths.end(); ++it1) { + for (ContigPaths::const_iterator it1 = seedPaths.begin(); it1 != seedPaths.end(); ++it1) { const ContigPath& path(mergedPaths[it1 - seedPaths.begin()]); ContigPath pathrc(path); reverseComplement(pathrc.begin(), pathrc.end()); - for (ContigPath::const_iterator it2 = it1->begin(); - it2 != it1->end(); ++it2) { + for (ContigPath::const_iterator it2 = it1->begin(); it2 != it1->end(); ++it2) { ContigNode seed(*it2); if (find(path.begin(), path.end(), seed) != path.end()) { - paths[seed.contigIndex()] - = seed.sense() ? pathrc : path; + paths[seed.contigIndex()] = seed.sense() ? pathrc : path; } else { // This seed was not included in the merged path. } @@ -812,8 +796,8 @@ } /** Read a set of paths from the specified file. */ -static ContigPathMap readPaths(const Lengths& lengths, - const string& filePath) +static ContigPathMap +readPaths(const Lengths& lengths, const string& filePath) { if (opt::verbose > 0) cerr << "Reading `" << filePath << "'..." << endl; @@ -833,17 +817,15 @@ continue; } - bool inserted = paths.insert( - make_pair(id, path)).second; + bool inserted = paths.insert(make_pair(id, path)).second; assert(inserted); (void)inserted; } assert(in.eof()); if (opt::seedLen > 0) - cout << "Ignored " << tooSmall - << " paths whose seeds are shorter than " - << opt::seedLen << " bp.\n"; + cout << "Ignored " << tooSmall << " paths whose seeds are shorter than " << opt::seedLen + << " bp.\n"; return paths; } @@ -851,16 +833,17 @@ * @return true if out != last */ template -bool atomicInc(T1& it, T2 last, T3& out) +bool +atomicInc(T1& it, T2 last, T3& out) { - #pragma omp critical(atomicInc) +#pragma omp critical(atomicInc) out = it == last ? it : it++; return out != last; } /** Build the path overlap graph. */ -static void buildPathGraph(const Lengths& lengths, - PathGraph& g, const ContigPathMap& paths) +static void +buildPathGraph(const Lengths& lengths, PathGraph& g, const ContigPathMap& paths) { // Create the vertices of the path overlap graph. PathGraph(lengths.size()).swap(g); @@ -874,11 +857,9 @@ // Find the overlapping paths. ContigPathMap::const_iterator sharedIt = paths.begin(); - #pragma omp parallel - for (ContigPathMap::const_iterator it; - atomicInc(sharedIt, paths.end(), it);) - findPathOverlaps(lengths, paths, - ContigNode(it->first, false), it->second, g); +#pragma omp parallel + for (ContigPathMap::const_iterator it; atomicInc(sharedIt, paths.end(), it);) + findPathOverlaps(lengths, paths, ContigNode(it->first, false), it->second, g); if (gDebugPrint) cout << '\n'; @@ -890,23 +871,23 @@ // graph statistics vector vals = passGraphStatsVal(g); - vector keys = make_vector() - << "V" - << "E" - << "degree0pctg" - << "degree1pctg" - << "degree234pctg" - << "degree5pctg" - << "degree_max"; + vector keys = make_vector() << "V" + << "E" + << "degree0pctg" + << "degree1pctg" + << "degree234pctg" + << "degree5pctg" + << "degree_max"; if (!opt::db.empty()) { - for (unsigned i=0; i> x; - opt::genomeSize = x; - break; - } - case 'g': arg >> opt::graphPath; break; - case 'j': arg >> opt::threads; break; - case 'k': arg >> opt::k; break; - case 'o': arg >> opt::out; break; - case 's': arg >> opt::seedLen; break; - case 'v': opt::verbose++; break; - case OPT_HELP: - cout << USAGE_MESSAGE; - exit(EXIT_SUCCESS); - case OPT_VERSION: - cout << VERSION_MESSAGE; - exit(EXIT_SUCCESS); - case OPT_DB: - arg >> opt::db; break; - case OPT_LIBRARY: - arg >> opt::metaVars[0]; break; - case OPT_STRAIN: - arg >> opt::metaVars[1]; break; - case OPT_SPECIES: - arg >> opt::metaVars[2]; break; + case '?': + die = true; + break; + case 'G': { + double x; + arg >> x; + opt::genomeSize = x; + break; + } + case 'g': + arg >> opt::graphPath; + break; + case 'j': + arg >> opt::threads; + break; + case 'k': + arg >> opt::k; + break; + case 'o': + arg >> opt::out; + break; + case 's': + arg >> opt::seedLen; + break; + case 'v': + opt::verbose++; + break; + case OPT_HELP: + cout << USAGE_MESSAGE; + exit(EXIT_SUCCESS); + case OPT_VERSION: + cout << VERSION_MESSAGE; + exit(EXIT_SUCCESS); + case OPT_DB: + arg >> opt::db; + break; + case OPT_LIBRARY: + arg >> opt::metaVars[0]; + break; + case OPT_STRAIN: + arg >> opt::metaVars[1]; + break; + case OPT_SPECIES: + arg >> opt::metaVars[2]; + break; } if (optarg != NULL && !arg.eof()) { - cerr << PROGRAM ": invalid option: `-" - << (char)c << optarg << "'\n"; + cerr << PROGRAM ": invalid option: `-" << (char)c << optarg << "'\n"; exit(EXIT_FAILURE); } } @@ -994,8 +993,7 @@ } if (die) { - cerr << "Try `" << PROGRAM - << " --help' for more information.\n"; + cerr << "Try `" << PROGRAM << " --help' for more information.\n"; exit(EXIT_FAILURE); } @@ -1013,18 +1011,11 @@ cerr << "Reading `" << argv[optind] << "'..." << endl; if (!opt::db.empty()) { - init(db, - opt::db, - opt::verbose, - PROGRAM, - opt::getCommand(argc, argv), - opt::metaVars - ); + init(db, opt::db, opt::verbose, PROGRAM, opt::getCommand(argc, argv), opt::metaVars); } Lengths lengths = readContigLengths(argv[optind++]); - ContigPathMap originalPathMap = readPaths( - lengths, argv[optind++]); + ContigPathMap originalPathMap = readPaths(lengths, argv[optind++]); removeRepeats(originalPathMap); @@ -1042,16 +1033,13 @@ ContigPathMap resultsPathMap; #if _OPENMP ContigPathMap::iterator sharedIt = originalPathMap.begin(); - #pragma omp parallel - for (ContigPathMap::iterator it; - atomicInc(sharedIt, originalPathMap.end(), it);) - extendPaths(lengths, - it->first, originalPathMap, resultsPathMap); +#pragma omp parallel + for (ContigPathMap::iterator it; atomicInc(sharedIt, originalPathMap.end(), it);) + extendPaths(lengths, it->first, originalPathMap, resultsPathMap); #else - for (ContigPathMap::const_iterator it = originalPathMap.begin(); - it != originalPathMap.end(); ++it) - extendPaths(lengths, - it->first, originalPathMap, resultsPathMap); + for (ContigPathMap::const_iterator it = originalPathMap.begin(); it != originalPathMap.end(); + ++it) + extendPaths(lengths, it->first, originalPathMap, resultsPathMap); #endif if (gDebugPrint) cout << '\n'; @@ -1060,27 +1048,23 @@ if (gDebugPrint) cout << "\nRemoving redundant contigs\n"; - set overlaps = removeSubsumedPaths(lengths, - resultsPathMap); + set overlaps = removeSubsumedPaths(lengths, resultsPathMap); if (!overlaps.empty() && !repeats.empty()) { // Remove the newly-discovered repeat contigs from the // original paths. - for (set::const_iterator it = repeats.begin(); - it != repeats.end(); ++it) + for (set::const_iterator it = repeats.begin(); it != repeats.end(); ++it) originalPathMap.erase(*it); // Reassemble the paths that were found to overlap. if (gDebugPrint) { cout << "\nReassembling overlapping contigs:"; - for (set::const_iterator it = overlaps.begin(); - it != overlaps.end(); ++it) + for (set::const_iterator it = overlaps.begin(); it != overlaps.end(); ++it) cout << ' ' << get(g_contigNames, *it); cout << '\n'; } - for (set::const_iterator it = overlaps.begin(); - it != overlaps.end(); ++it) { + for (set::const_iterator it = overlaps.begin(); it != overlaps.end(); ++it) { if (originalPathMap.count(*it) == 0) continue; // repeat ContigPathMap::iterator oldIt = resultsPathMap.find(*it); @@ -1088,8 +1072,7 @@ continue; // subsumed ContigPath old = oldIt->second; resultsPathMap.erase(oldIt); - extendPaths(lengths, - *it, originalPathMap, resultsPathMap); + extendPaths(lengths, *it, originalPathMap, resultsPathMap); if (gDebugPrint) { if (resultsPathMap[*it] == old) cout << "no change\n"; @@ -1104,8 +1087,7 @@ overlaps = removeSubsumedPaths(lengths, resultsPathMap); if (!overlaps.empty() && gDebugPrint) { cout << "\nOverlapping contigs:"; - for (set::const_iterator it = overlaps.begin(); - it != overlaps.end(); ++it) + for (set::const_iterator it = overlaps.begin(); it != overlaps.end(); ++it) cout << ' ' << get(g_contigNames, *it); cout << '\n'; } @@ -1117,21 +1099,23 @@ } /** Return the length of the specified contig in k-mer. */ -static unsigned getLength(const Lengths& lengths, - const ContigNode& u) +static unsigned +getLength(const Lengths& lengths, const ContigNode& u) { - return u.ambiguous() ? u.length() - : lengths.at(u.id()); + return u.ambiguous() ? u.length() : lengths.at(u.id()); } /** Functor to add the number of k-mer in two contigs. */ struct AddLength { - AddLength(const Lengths& lengths) : m_lengths(lengths) { } + AddLength(const Lengths& lengths) + : m_lengths(lengths) + {} unsigned operator()(unsigned addend, const ContigNode& u) const { return addend + getLength(m_lengths, u); } + private: const Lengths& m_lengths; }; @@ -1141,10 +1125,15 @@ * found. * @return true if an alignment is found */ -template -static bool alignCoordinates(const Lengths& lengths, - iterator& first1, iterator last1, - iterator& first2, iterator last2, oiterator& result) +template +static bool +alignCoordinates( + const Lengths& lengths, + iterator& first1, + iterator last1, + iterator& first2, + iterator last2, + oiterator& result) { oiterator out = result; @@ -1206,10 +1195,15 @@ * the consensus at out if an alignment is found. * @return true if an alignment is found */ -template -static bool buildConsensus(const Lengths& lengths, - iterator it1, iterator it1e, - iterator it2, iterator it2e, oiterator& out) +template +static bool +buildConsensus( + const Lengths& lengths, + iterator it1, + iterator it1e, + iterator it2, + iterator it2e, + oiterator& out) { iterator it1b = it1 + 1; assert(!it1b->ambiguous()); @@ -1229,21 +1223,16 @@ unsigned ambiguous1 = it1->length(); unsigned ambiguous2 = it2a->length(); - unsigned unambiguous1 = accumulate(it1b, it1e, - 0, AddLength(lengths)); - unsigned unambiguous2 = accumulate(it2, it2a, - 0, AddLength(lengths)); - if (ambiguous1 < unambiguous2 - || ambiguous2 < unambiguous1) { + unsigned unambiguous1 = accumulate(it1b, it1e, 0, AddLength(lengths)); + unsigned unambiguous2 = accumulate(it2, it2a, 0, AddLength(lengths)); + if (ambiguous1 < unambiguous2 || ambiguous2 < unambiguous1) { // Two gaps overlap and either of the gaps is smaller // than the unambiguous sequence that overlaps the // gap. No alignment. return false; } - unsigned n = max(1U, - max(ambiguous2 - unambiguous1, - ambiguous1 - unambiguous2)); + unsigned n = max(1U, max(ambiguous2 - unambiguous1, ambiguous1 - unambiguous2)); out = copy(it2, it2a, out); *out++ = ContigNode(n, 'N'); out = copy(it1b, it1e, out); @@ -1255,10 +1244,16 @@ * in it1 and it2. * @return true if an alignment is found */ -template -static bool alignAtSeed(const Lengths& lengths, - iterator& it1, iterator it1e, iterator last1, - iterator& it2, iterator last2, oiterator& out) +template +static bool +alignAtSeed( + const Lengths& lengths, + iterator& it1, + iterator it1e, + iterator last1, + iterator& it2, + iterator last2, + oiterator& out) { assert(it1 != last1); assert(it1->ambiguous()); @@ -1270,11 +1265,10 @@ // fewest number of contigs in the consensus sequence. unsigned bestLen = UINT_MAX; iterator bestIt2e; - for (iterator it2e = it2; - (it2e = find(it2e, last2, *it1e)) != last2; ++it2e) { + for (iterator it2e = it2; (it2e = find(it2e, last2, *it1e)) != last2; ++it2e) { oiterator myOut = out; - if (buildConsensus(lengths, it1, it1e, it2, it2e, myOut) - && align(lengths, it1e, last1, it2e, last2, myOut)) { + if (buildConsensus(lengths, it1, it1e, it2, it2e, myOut) && + align(lengths, it1e, last1, it2e, last2, myOut)) { unsigned len = myOut - out; if (len <= bestLen) { bestLen = len; @@ -1283,8 +1277,7 @@ } } if (bestLen != UINT_MAX) { - bool good = buildConsensus(lengths, - it1, it1e, it2, bestIt2e, out); + bool good = buildConsensus(lengths, it1, it1e, it2, bestIt2e, out); assert(good); it1 = it1e; it2 = bestIt2e; @@ -1297,10 +1290,15 @@ * The end of the alignment is returned in it1 and it2. * @return true if an alignment is found */ -template -static bool alignAmbiguous(const Lengths& lengths, - iterator& it1, iterator last1, - iterator& it2, iterator last2, oiterator& out) +template +static bool +alignAmbiguous( + const Lengths& lengths, + iterator& it1, + iterator last1, + iterator& it2, + iterator last2, + oiterator& out) { assert(it1 != last1); assert(it1->ambiguous()); @@ -1324,10 +1322,15 @@ * The end of the alignment is returned in it1 and it2. * @return true if an alignment is found */ -template -static bool alignOne(const Lengths& lengths, - iterator& it1, iterator last1, - iterator& it2, iterator last2, oiterator& out) +template +static bool +alignOne( + const Lengths& lengths, + iterator& it1, + iterator last1, + iterator& it2, + iterator last2, + oiterator& out) { // Check for a trivial alignment. unsigned n1 = last1 - it1, n2 = last2 - it2; @@ -1347,17 +1350,14 @@ return true; } - return - it1->ambiguous() && it2->ambiguous() - ? (it1->length() > it2->length() - ? alignAmbiguous(lengths, it1, last1, it2, last2, out) - : alignAmbiguous(lengths, it2, last2, it1, last1, out) - ) - : it1->ambiguous() - ? alignAmbiguous(lengths, it1, last1, it2, last2, out) - : it2->ambiguous() - ? alignAmbiguous(lengths, it2, last2, it1, last1, out) - : (*out++ = *it1, *it1++ == *it2++); + return it1->ambiguous() && it2->ambiguous() + ? (it1->length() > it2->length() + ? alignAmbiguous(lengths, it1, last1, it2, last2, out) + : alignAmbiguous(lengths, it2, last2, it1, last1, out)) + : it1->ambiguous() + ? alignAmbiguous(lengths, it1, last1, it2, last2, out) + : it2->ambiguous() ? alignAmbiguous(lengths, it2, last2, it1, last1, out) + : (*out++ = *it1, *it1++ == *it2++); } /** Align the ambiguous region [it1, last1) to [it2, last2) @@ -1365,10 +1365,15 @@ * @return the orientation of the alignment if an alignments is found * or zero otherwise */ -template -static dir_type align(const Lengths& lengths, - iterator it1, iterator last1, - iterator it2, iterator last2, oiterator& out) +template +static dir_type +align( + const Lengths& lengths, + iterator it1, + iterator last1, + iterator it2, + iterator last2, + oiterator& out) { assert(it1 != last1); assert(it2 != last2); @@ -1379,9 +1384,7 @@ out = copy(it1, last1, out); out = copy(it2, last2, out); return it1 == last1 && it2 == last2 ? DIR_B - : it1 == last1 ? DIR_F - : it2 == last2 ? DIR_R - : DIR_X; + : it1 == last1 ? DIR_F : it2 == last2 ? DIR_R : DIR_X; } /** Find an equivalent region of the two specified paths, starting the @@ -1389,27 +1392,27 @@ * @param[out] orientation the orientation of the alignment * @return the consensus sequence */ -static ContigPath align(const Lengths& lengths, - const ContigPath& p1, const ContigPath& p2, - ContigPath::const_iterator pivot1, - ContigPath::const_iterator pivot2, - dir_type& orientation) +static ContigPath +align( + const Lengths& lengths, + const ContigPath& p1, + const ContigPath& p2, + ContigPath::const_iterator pivot1, + ContigPath::const_iterator pivot2, + dir_type& orientation) { assert(*pivot1 == *pivot2); - ContigPath::const_reverse_iterator - rit1 = ContigPath::const_reverse_iterator(pivot1+1), - rit2 = ContigPath::const_reverse_iterator(pivot2+1); + ContigPath::const_reverse_iterator rit1 = ContigPath::const_reverse_iterator(pivot1 + 1), + rit2 = ContigPath::const_reverse_iterator(pivot2 + 1); ContigPath alignmentr(p1.rend() - rit1 + p2.rend() - rit2); ContigPath::iterator rout = alignmentr.begin(); - dir_type alignedr = align(lengths, - rit1, p1.rend(), rit2, p2.rend(), rout); + dir_type alignedr = align(lengths, rit1, p1.rend(), rit2, p2.rend(), rout); alignmentr.erase(rout, alignmentr.end()); ContigPath::const_iterator it1 = pivot1, it2 = pivot2; ContigPath alignmentf(p1.end() - it1 + p2.end() - it2); ContigPath::iterator fout = alignmentf.begin(); - dir_type alignedf = align(lengths, - it1, p1.end(), it2, p2.end(), fout); + dir_type alignedf = align(lengths, it1, p1.end(), it2, p2.end(), fout); alignmentf.erase(fout, alignmentf.end()); ContigPath consensus; @@ -1417,10 +1420,9 @@ // Found an alignment. assert(!alignmentf.empty()); assert(!alignmentr.empty()); - consensus.reserve(alignmentr.size()-1 + alignmentf.size()); - consensus.assign(alignmentr.rbegin(), alignmentr.rend()-1); - consensus.insert(consensus.end(), - alignmentf.begin(), alignmentf.end()); + consensus.reserve(alignmentr.size() - 1 + alignmentf.size()); + consensus.assign(alignmentr.rbegin(), alignmentr.rend() - 1); + consensus.insert(consensus.end(), alignmentf.begin(), alignmentf.end()); // Determine the orientation of the alignment. unsigned dirs = alignedr << 2 | alignedf; @@ -1452,15 +1454,14 @@ /** Return a pivot suitable for aligning the two paths if one exists, * otherwise return false. */ -static pair findPivot( - const ContigPath& path1, const ContigPath& path2) +static pair +findPivot(const ContigPath& path1, const ContigPath& path2) { - for (ContigPath::const_iterator it = path2.begin(); - it != path2.end(); ++it) { + for (ContigPath::const_iterator it = path2.begin(); it != path2.end(); ++it) { if (it->ambiguous()) continue; - if (count(path2.begin(), path2.end(), *it) == 1 - && count(path1.begin(), path1.end(), *it) == 1) + if (count(path2.begin(), path2.end(), *it) == 1 && + count(path1.begin(), path1.end(), *it) == 1) return make_pair(*it, true); } return make_pair(ContigNode(0), false); @@ -1470,9 +1471,13 @@ * @param[out] orientation the orientation of the alignment * @return the consensus sequence */ -static ContigPath align(const Lengths& lengths, - const ContigPath& path1, const ContigPath& path2, - ContigNode pivot, dir_type& orientation) +static ContigPath +align( + const Lengths& lengths, + const ContigPath& path1, + const ContigPath& path2, + ContigNode pivot, + dir_type& orientation) { if (&path1 == &path2) { // Ignore the trivial alignment when aligning a path to @@ -1482,24 +1487,20 @@ orientation = DIR_B; return path1; } else { - ContigPath::const_iterator it - = search(path1.begin(), path1.end(), - path2.begin(), path2.end()); + ContigPath::const_iterator it = + search(path1.begin(), path1.end(), path2.begin(), path2.end()); if (it != path1.end()) { // path2 is subsumed in path1. // Determine the orientation of the edge. - orientation - = it == path1.begin() ? DIR_R - : it + path2.size() == path1.end() ? DIR_F - : DIR_B; + orientation = + it == path1.begin() ? DIR_R : it + path2.size() == path1.end() ? DIR_F : DIR_B; return path1; } } // Find a suitable pivot. - if (find(path1.begin(), path1.end(), pivot) == path1.end() - || find(path2.begin(), path2.end(), pivot) - == path2.end()) { + if (find(path1.begin(), path1.end(), pivot) == path1.end() || + find(path2.begin(), path2.end(), pivot) == path2.end()) { bool good; tie(pivot, good) = findPivot(path1, path2); if (!good) @@ -1507,29 +1508,26 @@ } assert(find(path1.begin(), path1.end(), pivot) != path1.end()); - ContigPath::const_iterator it2 = find(path2.begin(), path2.end(), - pivot); + ContigPath::const_iterator it2 = find(path2.begin(), path2.end(), pivot); assert(it2 != path2.end()); if (&path1 != &path2) { // The seed must be unique in path2, unless we're aligning a // path to itself. - assert(count(it2+1, path2.end(), pivot) == 0); + assert(count(it2 + 1, path2.end(), pivot) == 0); } ContigPath consensus; for (ContigPath::const_iterator it1 = find_if( - path1.begin(), path1.end(), - bind2nd(equal_to(), pivot)); - it1 != path1.end(); - it1 = find_if(it1+1, path1.end(), - bind2nd(equal_to(), pivot))) { + path1.begin(), path1.end(), [&pivot](const ContigNode& c) { return c == pivot; }); + it1 != path1.end(); + it1 = + find_if(it1 + 1, path1.end(), [&pivot](const ContigNode& c) { return c == pivot; })) { if (&*it1 == &*it2) { // We are aligning a path to itself, and this is the // trivial alignment, which we'll ignore. continue; } - consensus = align(lengths, - path1, path2, it1, it2, orientation); + consensus = align(lengths, path1, path2, it1, it2, orientation); if (!consensus.empty()) return consensus; } @@ -1539,9 +1537,8 @@ /** Find an equivalent region of the two specified paths. * @return the consensus sequence */ -static ContigPath align(const Lengths& lengths, - const ContigPath& path1, const ContigPath& path2, - ContigNode pivot) +static ContigPath +align(const Lengths& lengths, const ContigPath& path1, const ContigPath& path2, ContigNode pivot) { dir_type orientation; return align(lengths, path1, path2, pivot, orientation); diff -Nru abyss-2.2.3/Misc/samtobreak.hs abyss-2.2.4/Misc/samtobreak.hs --- abyss-2.2.3/Misc/samtobreak.hs 2019-09-27 20:02:10.000000000 +0000 +++ abyss-2.2.4/Misc/samtobreak.hs 2020-01-30 18:48:44.000000000 +0000 @@ -279,7 +279,7 @@ where help = putStr (usageInfo usage options) >> exitSuccess tryHelp = "Try 'abyss-samtobreak --help' for more information." - version = "abyss-samtobreak (ABySS) 2.2.3\n" + version = "abyss-samtobreak (ABySS) 2.2.4\n" usage = "Usage: samtobreak [OPTION]... [FILE]...\n\ \Calculate contig and scaffold contiguity and correctness metrics.\n" diff -Nru abyss-2.2.3/ParseAligns/abyss-fixmate.cc abyss-2.2.4/ParseAligns/abyss-fixmate.cc --- abyss-2.2.3/ParseAligns/abyss-fixmate.cc 2019-09-27 20:02:10.000000000 +0000 +++ abyss-2.2.4/ParseAligns/abyss-fixmate.cc 2020-01-30 18:48:44.000000000 +0000 @@ -1,3 +1,6 @@ +#include "ContigID.h" +#include "DataBase/DB.h" +#include "DataBase/Options.h" #include "Histogram.h" #include "IOUtil.h" #include "MemoryUtil.h" @@ -5,8 +8,8 @@ #include "StringUtil.h" #include "Uncompress.h" #include "UnorderedMap.h" -#include "ContigID.h" #include +#include #include #include #include @@ -14,9 +17,6 @@ #include #include #include -#include -#include "DataBase/Options.h" -#include "DataBase/DB.h" using namespace std; @@ -25,47 +25,47 @@ DB db; static const char VERSION_MESSAGE[] = -PROGRAM " (" PACKAGE_NAME ") " VERSION "\n" -"Written by Shaun Jackman.\n" -"\n" -"Copyright 2014 Canada's Michael Smith Genome Sciences Centre\n"; + PROGRAM " (" PACKAGE_NAME ") " VERSION "\n" + "Written by Shaun Jackman.\n" + "\n" + "Copyright 2014 Canada's Michael Smith Genome Sciences Centre\n"; static const char USAGE_MESSAGE[] = -"Usage: " PROGRAM " [OPTION]... [FILE]...\n" -"Write read pairs that map to the same contig to the file SAME.\n" -"Write read pairs that map to different contigs to stdout.\n" -"Alignments may be in FILE(s) or standard input.\n" -"\n" -" Options:\n" -"\n" -" --no-qname set the qname to * [default]\n" -" --qname do not alter the qname\n" -" --all print all alignments\n" -" --diff print alignments that align to different\n" -" contigs [default]\n" -" -l, --min-align=N the minimal alignment size [1]\n" -" -s, --same=SAME write properly-paired reads to this file\n" -" -h, --hist=FILE write the fragment size histogram to FILE\n" -" -c, --cov=FILE write the physical coverage to FILE\n" -" -v, --verbose display verbose output\n" -" --help display this help and exit\n" -" --version output version information and exit\n" -" --db=FILE specify path of database repository in FILE\n" -" --library=NAME specify library NAME for sqlite\n" -" --strain=NAME specify strain NAME for sqlite\n" -" --species=NAME specify species NAME for sqlite\n" -"\n" -"Report bugs to <" PACKAGE_BUGREPORT ">.\n"; + "Usage: " PROGRAM " [OPTION]... [FILE]...\n" + "Write read pairs that map to the same contig to the file SAME.\n" + "Write read pairs that map to different contigs to stdout.\n" + "Alignments may be in FILE(s) or standard input.\n" + "\n" + " Options:\n" + "\n" + " --no-qname set the qname to * [default]\n" + " --qname do not alter the qname\n" + " --all print all alignments\n" + " --diff print alignments that align to different\n" + " contigs [default]\n" + " -l, --min-align=N the minimal alignment size [1]\n" + " -s, --same=SAME write properly-paired reads to this file\n" + " -h, --hist=FILE write the fragment size histogram to FILE\n" + " -c, --cov=FILE write the physical coverage to FILE\n" + " -v, --verbose display verbose output\n" + " --help display this help and exit\n" + " --version output version information and exit\n" + " --db=FILE specify path of database repository in FILE\n" + " --library=NAME specify library NAME for sqlite\n" + " --strain=NAME specify strain NAME for sqlite\n" + " --species=NAME specify species NAME for sqlite\n" + "\n" + "Report bugs to <" PACKAGE_BUGREPORT ">.\n"; namespace opt { - string db; - dbVars metaVars; - static string fragPath; - static string histPath; - static string covPath; - static int qname; - static int verbose; - static int print_all; +string db; +dbVars metaVars; +static string fragPath; +static string histPath; +static string covPath; +static int qname; +static int verbose; +static int print_all; } // for sqlite params @@ -74,28 +74,35 @@ static const char shortopts[] = "h:c:l:s:v"; -enum { OPT_HELP = 1, OPT_VERSION, OPT_DB, OPT_LIBRARY, OPT_STRAIN, OPT_SPECIES }; - -static const struct option longopts[] = { - { "qname", no_argument, &opt::qname, 1 }, - { "no-qname", no_argument, &opt::qname, 0 }, - { "all", no_argument, &opt::print_all, 1 }, - { "diff", no_argument, &opt::print_all, 0 }, - { "min-align", required_argument, NULL, 'l' }, - { "hist", required_argument, NULL, 'h' }, - { "cov", required_argument, NULL, 'c' }, - { "same", required_argument, NULL, 's' }, - { "verbose", no_argument, NULL, 'v' }, - { "help", no_argument, NULL, OPT_HELP }, - { "version", no_argument, NULL, OPT_VERSION }, - { "db", required_argument, NULL, OPT_DB }, - { "library", required_argument, NULL, OPT_LIBRARY }, - { "strain", required_argument, NULL, OPT_STRAIN }, - { "species", required_argument, NULL, OPT_SPECIES }, - { NULL, 0, NULL, 0 } +enum +{ + OPT_HELP = 1, + OPT_VERSION, + OPT_DB, + OPT_LIBRARY, + OPT_STRAIN, + OPT_SPECIES }; -static struct { +static const struct option longopts[] = { { "qname", no_argument, &opt::qname, 1 }, + { "no-qname", no_argument, &opt::qname, 0 }, + { "all", no_argument, &opt::print_all, 1 }, + { "diff", no_argument, &opt::print_all, 0 }, + { "min-align", required_argument, NULL, 'l' }, + { "hist", required_argument, NULL, 'h' }, + { "cov", required_argument, NULL, 'c' }, + { "same", required_argument, NULL, 's' }, + { "verbose", no_argument, NULL, 'v' }, + { "help", no_argument, NULL, OPT_HELP }, + { "version", no_argument, NULL, OPT_VERSION }, + { "db", required_argument, NULL, OPT_DB }, + { "library", required_argument, NULL, OPT_LIBRARY }, + { "strain", required_argument, NULL, OPT_STRAIN }, + { "species", required_argument, NULL, OPT_SPECIES }, + { NULL, 0, NULL, 0 } }; + +static struct +{ size_t alignments; size_t bothUnaligned; size_t oneUnaligned; @@ -106,9 +113,10 @@ static ofstream g_fragFile; static Histogram g_histogram; static ofstream g_covFile; -static vector< vector > g_contigCov; +static vector> g_contigCov; -static void incrementRange(SAMRecord& a) +static void +incrementRange(SAMRecord& a) { unsigned inx = get(g_contigNames, a.rname); g_contigCov[inx][a.pos]++; @@ -119,14 +127,12 @@ g_contigCov[inx][end]--; } -static void handlePair(SAMRecord& a0, SAMRecord& a1) +static void +handlePair(SAMRecord& a0, SAMRecord& a1) { - if ((a0.isRead1() && a1.isRead1()) - || (a0.isRead2() && a1.isRead2())) { - cerr << "error: duplicate read ID `" << a0.qname - << (a0.isRead1() ? "/1" : "") - << (a0.isRead2() ? "/2" : "") - << "'\n"; + if ((a0.isRead1() && a1.isRead1()) || (a0.isRead2() && a1.isRead2())) { + cerr << "error: duplicate read ID `" << a0.qname << (a0.isRead1() ? "/1" : "") + << (a0.isRead2() ? "/2" : "") << "'\n"; exit(EXIT_FAILURE); } @@ -175,7 +181,8 @@ typedef boost::unordered_map Alignments; #endif -static void printProgress(const Alignments& map) +static void +printProgress(const Alignments& map) { if (opt::verbose == 0) return; @@ -189,16 +196,15 @@ prevBuckets = buckets; size_t size = map.size(); cerr << "Read " << stats.alignments << " alignments. " - << "Hash load: " << size << " / " << buckets - << " = " << (float)size / buckets - << " using " << toSI(getMemoryUsage()) << "B." << endl; + << "Hash load: " << size << " / " << buckets << " = " << (float)size / buckets + << " using " << toSI(getMemoryUsage()) << "B." << endl; } } -static void handleAlignment(SAMRecord& sam, Alignments& map) +static void +handleAlignment(SAMRecord& sam, Alignments& map) { - pair it = map.insert( - make_pair(sam.qname, sam)); + pair it = map.insert(make_pair(sam.qname, sam)); if (!it.second) { #if SAM_SEQ_QUAL SAMRecord& a0 = it.first->second; @@ -222,7 +228,8 @@ printProgress(map); } -static void assert_eof(istream& in) +static void +assert_eof(istream& in) { if (in.eof()) return; @@ -234,21 +241,26 @@ } /** Print physical coverage in wiggle format. */ -static void printCov(string file) +static void +printCov(string file) { ofstream out(file.c_str()); for (unsigned i = 0; i < g_contigCov.size(); i++) { out << "variableStep\tchrom=" << get(g_contigNames, i) << '\n'; int prev = g_contigCov[i][0]; - if (prev != 0) out << "0\t" << prev << '\n'; + if (prev != 0) + out << "0\t" << prev << '\n'; for (unsigned j = 1; j < g_contigCov[i].size(); j++) { prev += g_contigCov[i][j]; - if (prev != 0) out << j << "\t" << prev << '\n'; + if (prev != 0) + out << j << "\t" << prev << '\n'; } } } -void parseTag(string line) { +void +parseTag(string line) +{ stringstream ss(line); string tag; char type[2]; @@ -269,10 +281,11 @@ assert(length > 0); assert(id.size() > 0); put(g_contigNames, g_contigCov.size(), id); - g_contigCov.push_back(vector(length)); + g_contigCov.push_back(vector(length)); } -static void readAlignments(istream& in, Alignments* pMap) +static void +readAlignments(istream& in, Alignments* pMap) { for (SAMRecord sam; in >> ws;) { if (in.peek() == '@') { @@ -294,7 +307,8 @@ assert_eof(in); } -static void readAlignmentsFile(string path, Alignments* pMap) +static void +readAlignmentsFile(string path, Alignments* pMap) { if (opt::verbose > 0) cerr << "Reading `" << path << "'..." << endl; @@ -305,103 +319,118 @@ } /** Return the specified number formatted as a percent. */ -static string percent(size_t x, size_t n) +static string +percent(size_t x, size_t n) { ostringstream ss; ss << setw((int)log10(n) + 1) << x; if (x > 0) - ss << " " << setprecision(3) << (float)100*x/n << '%'; + ss << " " << setprecision(3) << (float)100 * x / n << '%'; return ss.str(); } /** Print statistics of the specified histogram. h not passed by - * reference because we want to make a copy + * reference because we want to make a copy **/ -static void printHistogramStats(Histogram h) +static void +printHistogramStats(Histogram h) { unsigned n_orig = h.size(); h.eraseNegative(); h.removeNoise(); h.removeOutliers(); h = h.trimFraction(0.0001); - cerr << "Stats mean: " << setprecision(4) << h.mean() << " " - "median: " << setprecision(4) << h.median() << " " - "sd: " << setprecision(4) << h.sd() << " " - "n: " << h.size() << " " - "min: " << h.minimum() << " " - "max: " << h.maximum() << " " - "ignored: " << n_orig - h.size() << '\n' - << h.barplot() << endl; + cerr << "Stats mean: " << setprecision(4) << h.mean() + << " " + "median: " + << setprecision(4) << h.median() + << " " + "sd: " + << setprecision(4) << h.sd() + << " " + "n: " + << h.size() + << " " + "min: " + << h.minimum() + << " " + "max: " + << h.maximum() + << " " + "ignored: " + << n_orig - h.size() << '\n' + << h.barplot() << endl; if (!opt::db.empty()) { - vals = make_vector() - << (int)round(h.mean()) - << h.median() - << (int)round(h.sd()) - << h.size() - << h.minimum() - << h.maximum() - << n_orig-h.size(); - - keys = make_vector() - << "mean" - << "median" - << "sd" - << "n" - << "min" - << "max" - << "ignored"; + vals = make_vector() << (int)round(h.mean()) << h.median() << (int)round(h.sd()) + << h.size() << h.minimum() << h.maximum() << n_orig - h.size(); + + keys = make_vector() << "mean" + << "median" + << "sd" + << "n" + << "min" + << "max" + << "ignored"; - for (unsigned i=0; i> opt::minAlign; - break; - case 's': arg >> opt::fragPath; break; - case 'h': arg >> opt::histPath; break; - case 'c': arg >> opt::covPath; break; - case 'v': opt::verbose++; break; - case OPT_HELP: - cout << USAGE_MESSAGE; - exit(EXIT_SUCCESS); - case OPT_VERSION: - cout << VERSION_MESSAGE; - exit(EXIT_SUCCESS); - case OPT_DB: - arg >> opt::db; - break; - case OPT_LIBRARY: - arg >> opt::metaVars[0]; - break; - case OPT_STRAIN: - arg >> opt::metaVars[1]; - break; - case OPT_SPECIES: - arg >> opt::metaVars[2]; - break; + case '?': + die = true; + break; + case 'l': + arg >> opt::minAlign; + break; + case 's': + arg >> opt::fragPath; + break; + case 'h': + arg >> opt::histPath; + break; + case 'c': + arg >> opt::covPath; + break; + case 'v': + opt::verbose++; + break; + case OPT_HELP: + cout << USAGE_MESSAGE; + exit(EXIT_SUCCESS); + case OPT_VERSION: + cout << VERSION_MESSAGE; + exit(EXIT_SUCCESS); + case OPT_DB: + arg >> opt::db; + break; + case OPT_LIBRARY: + arg >> opt::metaVars[0]; + break; + case OPT_STRAIN: + arg >> opt::metaVars[1]; + break; + case OPT_SPECIES: + arg >> opt::metaVars[2]; + break; } if (optarg != NULL && !arg.eof()) { - cerr << PROGRAM ": invalid option: `-" - << (char)c << optarg << "'\n"; + cerr << PROGRAM ": invalid option: `-" << (char)c << optarg << "'\n"; exit(EXIT_FAILURE); } } if (die) { - cerr << "Try `" << PROGRAM - << " --help' for more information.\n"; + cerr << "Try `" << PROGRAM << " --help' for more information.\n"; exit(EXIT_FAILURE); } @@ -411,18 +440,13 @@ } if (!opt::db.empty()) - init(db, - opt::db, - opt::verbose, - PROGRAM, - opt::getCommand(argc, argv), - opt::metaVars - ); + init(db, opt::db, opt::verbose, PROGRAM, opt::getCommand(argc, argv), opt::metaVars); Alignments alignments(1); if (optind < argc) { - for_each(argv + optind, argv + argc, - bind2nd(ptr_fun(readAlignmentsFile), &alignments)); + for_each(argv + optind, argv + argc, [&alignments](const std::string& s) { + readAlignmentsFile(s, &alignments); + }); } else { if (opt::verbose > 0) cerr << "Reading from standard input..." << endl; @@ -435,8 +459,7 @@ // Print the unpaired alignments. if (opt::print_all) { - for (Alignments::iterator it = alignments.begin(); - it != alignments.end(); it++) { + for (Alignments::iterator it = alignments.begin(); it != alignments.end(); it++) { #if SAM_SEQ_QUAL SAMRecord& a0 = it->second; #else @@ -450,49 +473,52 @@ unsigned numRF = g_histogram.count(INT_MIN, 0); unsigned numFR = g_histogram.count(1, INT_MAX); - size_t sum = alignments.size() - + stats.bothUnaligned + stats.oneUnaligned - + numFR + numRF + stats.numFF - + stats.numDifferent; - cerr << - "Mateless " << percent(alignments.size(), sum) << "\n" - "Unaligned " << percent(stats.bothUnaligned, sum) << "\n" - "Singleton " << percent(stats.oneUnaligned, sum) << "\n" - "FR " << percent(numFR, sum) << "\n" - "RF " << percent(numRF, sum) << "\n" - "FF " << percent(stats.numFF, sum) << "\n" - "Different " << percent(stats.numDifferent, sum) << "\n" - "Total " << sum << endl; + size_t sum = alignments.size() + stats.bothUnaligned + stats.oneUnaligned + numFR + numRF + + stats.numFF + stats.numDifferent; + cerr << "Mateless " << percent(alignments.size(), sum) + << "\n" + "Unaligned " + << percent(stats.bothUnaligned, sum) + << "\n" + "Singleton " + << percent(stats.oneUnaligned, sum) + << "\n" + "FR " + << percent(numFR, sum) + << "\n" + "RF " + << percent(numRF, sum) + << "\n" + "FF " + << percent(stats.numFF, sum) + << "\n" + "Different " + << percent(stats.numDifferent, sum) + << "\n" + "Total " + << sum << endl; if (!opt::db.empty()) { - vals = make_vector() - << alignments.size() - << stats.bothUnaligned - << stats.oneUnaligned - << numFR - << numRF - << stats.numFF - << stats.numDifferent - << sum; - - keys = make_vector() - << "Mateless" - << "Unaligned" - << "Singleton" - << "FR" - << "RF" - << "FF" - << "Different" - << "Total"; + vals = make_vector() << alignments.size() << stats.bothUnaligned << stats.oneUnaligned + << numFR << numRF << stats.numFF << stats.numDifferent << sum; + + keys = make_vector() << "Mateless" + << "Unaligned" + << "Singleton" + << "FR" + << "RF" + << "FF" + << "Different" + << "Total"; - for (unsigned i=0; i numFR && stats.numFF > numRF) { cerr << PROGRAM ": error: The mate pairs of this library are " - "oriented forward-forward (FF), which is not supported " - "by ABySS." - << endl; + "oriented forward-forward (FF), which is not supported " + "by ABySS." + << endl; exit(EXIT_FAILURE); } diff -Nru abyss-2.2.3/ParseAligns/ParseAligns.cpp abyss-2.2.4/ParseAligns/ParseAligns.cpp --- abyss-2.2.3/ParseAligns/ParseAligns.cpp 2019-09-27 20:02:10.000000000 +0000 +++ abyss-2.2.4/ParseAligns/ParseAligns.cpp 2020-01-30 18:48:44.000000000 +0000 @@ -27,67 +27,76 @@ #define PROGRAM "ParseAligns" static const char VERSION_MESSAGE[] = -PROGRAM " (" PACKAGE_NAME ") " VERSION "\n" -"Written by Jared Simpson and Shaun Jackman.\n" -"\n" -"Copyright 2014 Canada's Michael Smith Genome Sciences Centre\n"; + PROGRAM " (" PACKAGE_NAME ") " VERSION "\n" + "Written by Jared Simpson and Shaun Jackman.\n" + "\n" + "Copyright 2014 Canada's Michael Smith Genome Sciences Centre\n"; static const char USAGE_MESSAGE[] = -"Usage: " PROGRAM " -k [OPTION]... [FILE]...\n" -"Write pairs that map to the same contig to the file SAME.\n" -"Write pairs that map to different contigs to standard output.\n" -"Alignments may be read from FILE(s) or standard input.\n" -"\n" -" Options:\n" -"\n" -" -l, --min-align=N minimum alignment length\n" -" -d, --dist=DISTANCE write distance estimates to this file\n" -" -f, --frag=SAME write fragment sizes to this file\n" -" -h, --hist=FILE write the fragment size histogram to FILE\n" -" --sam alignments are in SAM format\n" -" --kaligner alignments are in KAligner format\n" -" -c, --cover=COVERAGE coverage cut-off for distance estimates\n" -" -v, --verbose display verbose output\n" -" --help display this help and exit\n" -" --version output version information and exit\n" -"\n" -"Report bugs to <" PACKAGE_BUGREPORT ">.\n"; + "Usage: " PROGRAM " -k [OPTION]... [FILE]...\n" + "Write pairs that map to the same contig to the file SAME.\n" + "Write pairs that map to different contigs to standard output.\n" + "Alignments may be read from FILE(s) or standard input.\n" + "\n" + " Options:\n" + "\n" + " -l, --min-align=N minimum alignment length\n" + " -d, --dist=DISTANCE write distance estimates to this file\n" + " -f, --frag=SAME write fragment sizes to this file\n" + " -h, --hist=FILE write the fragment size histogram to FILE\n" + " --sam alignments are in SAM format\n" + " --kaligner alignments are in KAligner format\n" + " -c, --cover=COVERAGE coverage cut-off for distance estimates\n" + " -v, --verbose display verbose output\n" + " --help display this help and exit\n" + " --version output version information and exit\n" + "\n" + "Report bugs to <" PACKAGE_BUGREPORT ">.\n"; namespace opt { - unsigned k; // used by DistanceEst - static unsigned c; - static int verbose; - static string distPath; - static string fragPath; - static string histPath; - - /** Input alignment format. */ - static int inputFormat; - enum { KALIGNER, SAM }; +unsigned k; // used by DistanceEst +static unsigned c; +static int verbose; +static string distPath; +static string fragPath; +static string histPath; + +/** Input alignment format. */ +static int inputFormat; +enum +{ + KALIGNER, + SAM +}; - /** Output format */ - int format = ADJ; // used by Estimate +/** Output format */ +int format = ADJ; // used by Estimate } static const char shortopts[] = "d:l:f:h:c:v"; -enum { OPT_HELP = 1, OPT_VERSION }; +enum +{ + OPT_HELP = 1, + OPT_VERSION +}; static const struct option longopts[] = { - { "dist", required_argument, NULL, 'd' }, + { "dist", required_argument, NULL, 'd' }, { "min-align", required_argument, NULL, 'l' }, - { "frag", required_argument, NULL, 'f' }, - { "hist", required_argument, NULL, 'h' }, - { "kaligner",no_argument, &opt::inputFormat, opt::KALIGNER }, - { "sam", no_argument, &opt::inputFormat, opt::SAM }, - { "cover", required_argument, NULL, 'c' }, - { "verbose", no_argument, NULL, 'v' }, - { "help", no_argument, NULL, OPT_HELP }, - { "version", no_argument, NULL, OPT_VERSION }, + { "frag", required_argument, NULL, 'f' }, + { "hist", required_argument, NULL, 'h' }, + { "kaligner", no_argument, &opt::inputFormat, opt::KALIGNER }, + { "sam", no_argument, &opt::inputFormat, opt::SAM }, + { "cover", required_argument, NULL, 'c' }, + { "verbose", no_argument, NULL, 'v' }, + { "help", no_argument, NULL, OPT_HELP }, + { "version", no_argument, NULL, OPT_VERSION }, { NULL, 0, NULL, 0 } }; -static struct { +static struct +{ size_t alignments; size_t bothUnaligned; size_t oneUnaligned; @@ -109,14 +118,17 @@ typedef unordered_map EstimateMap; static EstimateMap estMap; -static bool checkUniqueAlignments(const AlignmentVector& alignVec); -static string makePairID(string id); +static bool +checkUniqueAlignments(const AlignmentVector& alignVec); +static string +makePairID(string id); /** * Return the size of the fragment demarcated by the specified * alignments. */ -static int fragmentSize(const Alignment& a0, const Alignment& a1) +static int +fragmentSize(const Alignment& a0, const Alignment& a1) { assert(a0.contig == a1.contig); assert(a0.isRC != a1.isRC); @@ -128,17 +140,16 @@ typedef pair Estimate; typedef vector Estimates; -static void addEstimate(EstimateMap& map, const Alignment& a, - Estimate& est, bool reverse) +static void +addEstimate(EstimateMap& map, const Alignment& a, Estimate& est, bool reverse) { - //count up the number of estimates that agree + // count up the number of estimates that agree bool placed = false; bool a_isRC = a.isRC != reverse; EstimateMap::iterator estimatesIt = map.find(a.contig); if (estimatesIt != map.end()) { Estimates& estimates = estimatesIt->second.estimates[a_isRC]; - for (Estimates::iterator estIt = estimates.begin(); - estIt != estimates.end(); ++estIt) { + for (Estimates::iterator estIt = estimates.begin(); estIt != estimates.end(); ++estIt) { if (estIt->first.id() == est.first.id()) { estIt->second.numPairs++; estIt->second.distance += est.second.distance; @@ -149,10 +160,10 @@ } if (!placed) map[a.contig].estimates[a_isRC].push_back(est); - } -static void doReadIntegrity(const ReadAlignMap::value_type& a) +static void +doReadIntegrity(const ReadAlignMap::value_type& a) { AlignmentVector::const_iterator refAlignIter = a.second.begin(); unsigned firstStart, lastEnd, largestSize; @@ -164,16 +175,14 @@ first = last = largest = *refAlignIter; ++refAlignIter; - //for each alignment in the vector a.second + // for each alignment in the vector a.second for (; refAlignIter != a.second.end(); ++refAlignIter) { if ((unsigned)refAlignIter->read_start_pos < firstStart) { firstStart = refAlignIter->read_start_pos; first = *refAlignIter; } - if ((unsigned)(refAlignIter->read_start_pos + - refAlignIter->align_length) > lastEnd) { - lastEnd = refAlignIter->read_start_pos + - refAlignIter->align_length; + if ((unsigned)(refAlignIter->read_start_pos + refAlignIter->align_length) > lastEnd) { + lastEnd = refAlignIter->read_start_pos + refAlignIter->align_length; last = *refAlignIter; } if ((unsigned)refAlignIter->align_length > largestSize) { @@ -184,27 +193,20 @@ if (largest.contig != last.contig) { Estimate est; - unsigned largest_end = - largest.read_start_pos + largest.align_length - opt::k; + unsigned largest_end = largest.read_start_pos + largest.align_length - opt::k; int distance = last.read_start_pos - largest_end; - est.first = find_vertex( - last.contig, largest.isRC != last.isRC, - g_contigNames); + est.first = find_vertex(last.contig, largest.isRC != last.isRC, g_contigNames); est.second.distance = distance - opt::k; est.second.numPairs = 1; est.second.stdDev = 0; addEstimate(estMap, largest, est, false); } - if (largest.contig != first.contig && - largest.contig != last.contig) { + if (largest.contig != first.contig && largest.contig != last.contig) { Estimate est; - unsigned first_end = - first.read_start_pos + first.align_length - opt::k; + unsigned first_end = first.read_start_pos + first.align_length - opt::k; int distance = last.read_start_pos - first_end; - est.first = find_vertex( - last.contig, first.isRC != last.isRC, - g_contigNames); + est.first = find_vertex(last.contig, first.isRC != last.isRC, g_contigNames); est.second.distance = distance - opt::k; est.second.numPairs = 1; est.second.stdDev = 0; @@ -215,12 +217,9 @@ largest.flipQuery(); first.flipQuery(); Estimate est; - unsigned largest_end = - largest.read_start_pos + largest.align_length - opt::k; + unsigned largest_end = largest.read_start_pos + largest.align_length - opt::k; int distance = first.read_start_pos - largest_end; - est.first = find_vertex( - first.contig, largest.isRC != first.isRC, - g_contigNames); + est.first = find_vertex(first.contig, largest.isRC != first.isRC, g_contigNames); est.second.distance = distance - opt::k; est.second.numPairs = 1; est.second.stdDev = 0; @@ -259,31 +258,27 @@ #endif } -static void generateDistFile() +static void +generateDistFile() { ofstream distFile(opt::distPath.c_str()); assert(distFile.is_open()); - for (EstimateMap::iterator mapIt = estMap.begin(); - mapIt != estMap.end(); ++mapIt) { - //Skip empty iterators + for (EstimateMap::iterator mapIt = estMap.begin(); mapIt != estMap.end(); ++mapIt) { + // Skip empty iterators assert(!mapIt->second.estimates[0].empty() || !mapIt->second.estimates[1].empty()); distFile << mapIt->first; for (int refIsRC = 0; refIsRC <= 1; refIsRC++) { if (refIsRC) distFile << " ;"; - for (Estimates::iterator vecIt - = mapIt->second.estimates[refIsRC].begin(); - vecIt != mapIt->second.estimates[refIsRC].end(); ++vecIt) { - vecIt->second.distance - = (int)round((double)vecIt->second.distance / - (double)vecIt->second.numPairs); - if (vecIt->second.numPairs >= opt::c - && vecIt->second.numPairs != 0 - /*&& vecIt->distance > 1 - opt::k*/) - distFile - << ' ' << get(g_contigNames, vecIt->first) - << ',' << vecIt->second; + for (Estimates::iterator vecIt = mapIt->second.estimates[refIsRC].begin(); + vecIt != mapIt->second.estimates[refIsRC].end(); + ++vecIt) { + vecIt->second.distance = + (int)round((double)vecIt->second.distance / (double)vecIt->second.numPairs); + if (vecIt->second.numPairs >= opt::c && vecIt->second.numPairs != 0 + /*&& vecIt->distance > 1 - opt::k*/) + distFile << ' ' << get(g_contigNames, vecIt->first) << ',' << vecIt->second; } } distFile << '\n'; @@ -292,8 +287,10 @@ distFile.close(); } -static bool isSingleEnd(const string& id); -static bool needsFlipping(const string& id); +static bool +isSingleEnd(const string& id); +static bool +needsFlipping(const string& id); /** * Return an alignment flipped as necessary to produce an alignment @@ -302,14 +299,14 @@ * alignment, so that the alignment is forward-reverse, which is * required by DistanceEst. */ -static const Alignment flipAlignment(const Alignment& a, - const string& id) +static const Alignment +flipAlignment(const Alignment& a, const string& id) { return needsFlipping(id) ? a.flipQuery() : a; } -static void handleAlignmentPair(const ReadAlignMap::value_type& curr, - const ReadAlignMap::value_type& pair) +static void +handleAlignmentPair(const ReadAlignMap::value_type& curr, const ReadAlignMap::value_type& pair) { const string& currID = curr.first; const string& pairID = pair.first; @@ -323,31 +320,24 @@ stats.bothUnaligned++; } else if (curr.second.empty() || pair.second.empty()) { stats.oneUnaligned++; - } else if (!checkUniqueAlignments(curr.second) - || !checkUniqueAlignments(pair.second)) { + } else if (!checkUniqueAlignments(curr.second) || !checkUniqueAlignments(pair.second)) { stats.numMulti++; - } else if (curr.second.size() > MAX_SPAN - && pair.second.size() > MAX_SPAN) { + } else if (curr.second.size() > MAX_SPAN && pair.second.size() > MAX_SPAN) { stats.numSplit++; } else { // Iterate over the vectors, outputting the aligments bool counted = false; - for (AlignmentVector::const_iterator refAlignIter - = curr.second.begin(); - refAlignIter != curr.second.end(); ++refAlignIter) { - for (AlignmentVector::const_iterator pairAlignIter - = pair.second.begin(); - pairAlignIter != pair.second.end(); - ++pairAlignIter) { - const Alignment& a0 = flipAlignment(*refAlignIter, - currID); - const Alignment& a1 = flipAlignment(*pairAlignIter, - pairID); + for (AlignmentVector::const_iterator refAlignIter = curr.second.begin(); + refAlignIter != curr.second.end(); + ++refAlignIter) { + for (AlignmentVector::const_iterator pairAlignIter = pair.second.begin(); + pairAlignIter != pair.second.end(); + ++pairAlignIter) { + const Alignment& a0 = flipAlignment(*refAlignIter, currID); + const Alignment& a1 = flipAlignment(*pairAlignIter, pairID); bool sameTarget = a0.contig == a1.contig; - if (sameTarget - && curr.second.size() == 1 - && pair.second.size() == 1) { + if (sameTarget && curr.second.size() == 1 && pair.second.size() == 1) { // Same target and the only alignment. if (a0.isRC != a1.isRC) { // Correctly oriented. Add this alignment to @@ -363,11 +353,9 @@ counted = true; } - bool outputSameTarget = opt::fragPath.empty() - && opt::histPath.empty(); + bool outputSameTarget = opt::fragPath.empty() && opt::histPath.empty(); if (!sameTarget || outputSameTarget) { - cout << SAMRecord(a0, a1) << '\n' - << SAMRecord(a1, a0) << '\n'; + cout << SAMRecord(a0, a1) << '\n' << SAMRecord(a1, a0) << '\n'; assert(cout.good()); } } @@ -377,7 +365,8 @@ } } -static void printProgress(const ReadAlignMap& map) +static void +printProgress(const ReadAlignMap& map) { if (opt::verbose == 0) return; @@ -390,16 +379,16 @@ if (stats.alignments % 1000000 == 0 || buckets != prevBuckets) { prevBuckets = buckets; size_t size = map.size(); - cerr << "Read " << stats.alignments << " alignments. " - "Hash load: " << size << " / " << buckets - << " = " << (float)size / buckets - << " using " << toSI(getMemoryUsage()) << "B." << endl; + cerr << "Read " << stats.alignments + << " alignments. " + "Hash load: " + << size << " / " << buckets << " = " << (float)size / buckets << " using " + << toSI(getMemoryUsage()) << "B." << endl; } } -static void handleAlignment( - const ReadAlignMap::value_type& alignments, - ReadAlignMap& out) +static void +handleAlignment(const ReadAlignMap::value_type& alignments, ReadAlignMap& out) { if (!isSingleEnd(alignments.first)) { string pairID = makePairID(alignments.first); @@ -408,8 +397,7 @@ handleAlignmentPair(*pairIter, alignments); out.erase(pairIter); } else if (!out.insert(alignments).second) { - cerr << "error: duplicate read ID `" << alignments.first - << "'\n"; + cerr << "error: duplicate read ID `" << alignments.first << "'\n"; exit(EXIT_FAILURE); } } @@ -421,13 +409,13 @@ printProgress(out); } -static void readAlignment(const string& line, ReadAlignMap& out) +static void +readAlignment(const string& line, ReadAlignMap& out) { istringstream s(line); pair v; switch (opt::inputFormat) { - case opt::SAM: - { + case opt::SAM: { SAMRecord sam; s >> sam; assert(s); @@ -439,23 +427,21 @@ if (!sam.isUnmapped()) v.second.push_back(sam); break; - } - case opt::KALIGNER: - { + } + case opt::KALIGNER: { s >> v.first; assert(s); v.second.reserve(count(line.begin(), line.end(), '\t')); - v.second.assign( - istream_iterator(s), - istream_iterator()); + v.second.assign(istream_iterator(s), istream_iterator()); assert(s.eof()); break; - } + } } handleAlignment(v, out); } -static void readAlignments(istream& in, ReadAlignMap* pout) +static void +readAlignments(istream& in, ReadAlignMap* pout) { for (string line; getline(in, line);) if (line.empty() || line[0] == '@') @@ -465,7 +451,8 @@ assert(in.eof()); } -static void readAlignmentsFile(string path, ReadAlignMap* pout) +static void +readAlignmentsFile(string path, ReadAlignMap* pout) { if (opt::verbose > 0) cerr << "Reading `" << path << "'..." << endl; @@ -476,51 +463,65 @@ } /** Return the specified number formatted as a percent. */ -static string percent(size_t x, size_t n) +static string +percent(size_t x, size_t n) { ostringstream ss; ss << setw((int)log10(n) + 1) << x; if (x > 0) - ss << " " << setprecision(3) << (float)100*x/n << '%'; + ss << " " << setprecision(3) << (float)100 * x / n << '%'; return ss.str(); } -int main(int argc, char* const* argv) +int +main(int argc, char* const* argv) { bool die = false; - for (int c; (c = getopt_long(argc, argv, - shortopts, longopts, NULL)) != -1;) { + for (int c; (c = getopt_long(argc, argv, shortopts, longopts, NULL)) != -1;) { istringstream arg(optarg != NULL ? optarg : ""); switch (c) { - case '?': die = true; break; - case 'l': arg >> opt::k; break; - case 'c': arg >> opt::c; break; - case 'd': arg >> opt::distPath; break; - case 'f': arg >> opt::fragPath; break; - case 'h': arg >> opt::histPath; break; - case 'v': opt::verbose++; break; - case OPT_HELP: - cout << USAGE_MESSAGE; - exit(EXIT_SUCCESS); - case OPT_VERSION: - cout << VERSION_MESSAGE; - exit(EXIT_SUCCESS); + case '?': + die = true; + break; + case 'l': + arg >> opt::k; + break; + case 'c': + arg >> opt::c; + break; + case 'd': + arg >> opt::distPath; + break; + case 'f': + arg >> opt::fragPath; + break; + case 'h': + arg >> opt::histPath; + break; + case 'v': + opt::verbose++; + break; + case OPT_HELP: + cout << USAGE_MESSAGE; + exit(EXIT_SUCCESS); + case OPT_VERSION: + cout << VERSION_MESSAGE; + exit(EXIT_SUCCESS); } if (optarg != NULL && !arg.eof()) { - cerr << PROGRAM ": invalid option: `-" - << (char)c << optarg << "'\n"; + cerr << PROGRAM ": invalid option: `-" << (char)c << optarg << "'\n"; exit(EXIT_FAILURE); } } if (opt::k <= 0 && opt::inputFormat == opt::KALIGNER) { - cerr << PROGRAM ": " << "missing -k,--kmer option\n"; + cerr << PROGRAM ": " + << "missing -k,--kmer option\n"; die = true; } if (die) { - cerr << "Try `" << PROGRAM - << " --help' for more information.\n"; + cerr << "Try `" << PROGRAM << " --help' for more information.\n"; exit(EXIT_FAILURE); } @@ -531,8 +532,9 @@ ReadAlignMap alignTable(1); if (optind < argc) { - for_each(argv + optind, argv + argc, - bind2nd(ptr_fun(readAlignmentsFile), &alignTable)); + for_each(argv + optind, argv + argc, [&alignTable](const std::string& s) { + readAlignmentsFile(s, &alignTable); + }); } else { if (opt::verbose > 0) cerr << "Reading from standard input..." << endl; @@ -543,21 +545,36 @@ unsigned numRF = histogram.count(INT_MIN, 0); unsigned numFR = histogram.count(1, INT_MAX); - size_t sum = alignTable.size() - + stats.bothUnaligned + stats.oneUnaligned - + numFR + numRF + stats.numFF - + stats.numDifferent + stats.numMulti + stats.numSplit; - cerr << - "Mateless " << percent(alignTable.size(), sum) << "\n" - "Unaligned " << percent(stats.bothUnaligned, sum) << "\n" - "Singleton " << percent(stats.oneUnaligned, sum) << "\n" - "FR " << percent(numFR, sum) << "\n" - "RF " << percent(numRF, sum) << "\n" - "FF " << percent(stats.numFF, sum) << "\n" - "Different " << percent(stats.numDifferent, sum) << "\n" - "Multimap " << percent(stats.numMulti, sum) << "\n" - "Split " << percent(stats.numSplit, sum) << "\n" - "Total " << sum << endl; + size_t sum = alignTable.size() + stats.bothUnaligned + stats.oneUnaligned + numFR + numRF + + stats.numFF + stats.numDifferent + stats.numMulti + stats.numSplit; + cerr << "Mateless " << percent(alignTable.size(), sum) + << "\n" + "Unaligned " + << percent(stats.bothUnaligned, sum) + << "\n" + "Singleton " + << percent(stats.oneUnaligned, sum) + << "\n" + "FR " + << percent(numFR, sum) + << "\n" + "RF " + << percent(numRF, sum) + << "\n" + "FF " + << percent(stats.numFF, sum) + << "\n" + "Different " + << percent(stats.numDifferent, sum) + << "\n" + "Multimap " + << percent(stats.numMulti, sum) + << "\n" + "Split " + << percent(stats.numSplit, sum) + << "\n" + "Total " + << sum << endl; if (!opt::distPath.empty()) generateDistFile(); @@ -580,17 +597,25 @@ histogram.removeOutliers(); Histogram h = histogram.trimFraction(0.0001); if (opt::verbose > 0) - cerr << "Stats mean: " << setprecision(4) << h.mean() << " " - "median: " << setprecision(4) << h.median() << " " - "sd: " << setprecision(4) << h.sd() << " " - "n: " << h.size() << " " - "min: " << h.minimum() << " max: " << h.maximum() << '\n' - << h.barplot() << endl; + cerr << "Stats mean: " << setprecision(4) << h.mean() + << " " + "median: " + << setprecision(4) << h.median() + << " " + "sd: " + << setprecision(4) << h.sd() + << " " + "n: " + << h.size() + << " " + "min: " + << h.minimum() << " max: " << h.maximum() << '\n' + << h.barplot() << endl; if (stats.numFF > numFR && stats.numFF > numRF) { cerr << "error: The mate pairs of this library are oriented " - "forward-forward (FF), which is not supported by ABySS." - << endl; + "forward-forward (FF), which is not supported by ABySS." + << endl; exit(EXIT_FAILURE); } @@ -599,7 +624,8 @@ /** Return whether any k-mer in the query is aligned more than once. */ -static bool checkUniqueAlignments(const AlignmentVector& alignVec) +static bool +checkUniqueAlignments(const AlignmentVector& alignVec) { assert(!alignVec.empty()); if (alignVec.size() == 1) @@ -608,11 +634,9 @@ unsigned nKmer = alignVec.front().read_length - opt::k + 1; vector coverage(nKmer); - for (AlignmentVector::const_iterator iter = alignVec.begin(); - iter != alignVec.end(); ++iter) { + for (AlignmentVector::const_iterator iter = alignVec.begin(); iter != alignVec.end(); ++iter) { assert((unsigned)iter->align_length >= opt::k); - unsigned end = iter->read_start_pos - + iter->align_length - opt::k + 1; + unsigned end = iter->read_start_pos + iter->align_length - opt::k + 1; assert(end <= nKmer); for (unsigned i = iter->read_start_pos; i < end; i++) coverage[i]++; @@ -624,31 +648,30 @@ return true; } -static bool replaceSuffix(string& s, - const string& suffix0, const string& suffix1) +static bool +replaceSuffix(string& s, const string& suffix0, const string& suffix1) { if (endsWith(s, suffix0)) { - s.replace(s.length() - suffix0.length(), string::npos, - suffix1); + s.replace(s.length() - suffix0.length(), string::npos, suffix1); return true; } else if (endsWith(s, suffix1)) { - s.replace(s.length() - suffix1.length(), string::npos, - suffix0); + s.replace(s.length() - suffix1.length(), string::npos, suffix0); return true; } else return false; } /** Return true if the specified read ID is of a single-end read. */ -static bool isSingleEnd(const string& id) +static bool +isSingleEnd(const string& id) { unsigned l = id.length(); - return endsWith(id, ".fn") - || (l > 6 && id.substr(l-6, 5) == ".part"); + return endsWith(id, ".fn") || (l > 6 && id.substr(l - 6, 5) == ".part"); } /** Return the mate ID of the specified read ID. */ -static string makePairID(string id) +static string +makePairID(string id) { if (equal(id.begin(), id.begin() + 3, "SRR")) return id; @@ -656,27 +679,44 @@ assert(!id.empty()); char& c = id[id.length() - 1]; switch (c) { - case '1': c = '2'; return id; - case '2': c = '1'; return id; - case 'A': c = 'B'; return id; - case 'B': c = 'A'; return id; - case 'F': c = 'R'; return id; - case 'R': c = 'F'; return id; - case 'f': c = 'r'; return id; - case 'r': c = 'f'; return id; + case '1': + c = '2'; + return id; + case '2': + c = '1'; + return id; + case 'A': + c = 'B'; + return id; + case 'B': + c = 'A'; + return id; + case 'F': + c = 'R'; + return id; + case 'R': + c = 'F'; + return id; + case 'f': + c = 'r'; + return id; + case 'r': + c = 'f'; + return id; } - if (replaceSuffix(id, "forward", "reverse") - || replaceSuffix(id, "F3", "R3")) + if (replaceSuffix(id, "forward", "reverse") || replaceSuffix(id, "F3", "R3")) return id; - cerr << "error: read ID `" << id << "' must end in one of\n" - "\t1 and 2 or A and B or F and R or" - " F3 and R3 or forward and reverse\n"; + cerr << "error: read ID `" << id + << "' must end in one of\n" + "\t1 and 2 or A and B or F and R or" + " F3 and R3 or forward and reverse\n"; exit(EXIT_FAILURE); } -static bool needsFlipping(const string& id) +static bool +needsFlipping(const string& id) { return endsWith(id, "F3"); } diff -Nru abyss-2.2.3/PathOverlap/PathOverlap.cpp abyss-2.2.4/PathOverlap/PathOverlap.cpp --- abyss-2.2.3/PathOverlap/PathOverlap.cpp 2019-09-27 20:02:10.000000000 +0000 +++ abyss-2.2.4/PathOverlap/PathOverlap.cpp 2020-01-30 18:48:44.000000000 +0000 @@ -1,27 +1,27 @@ -#include "config.h" #include "ContigID.h" #include "ContigPath.h" #include "ContigProperties.h" +#include "DataBase/DB.h" +#include "DataBase/Options.h" #include "Functional.h" -#include "IOUtil.h" -#include "Uncompress.h" #include "Graph/ContigGraph.h" #include "Graph/ContigGraphAlgorithms.h" #include "Graph/DirectedGraph.h" #include "Graph/GraphIO.h" +#include "IOUtil.h" +#include "Uncompress.h" +#include "config.h" #include #include #include -#include // for strerror #include -#include -#include +#include // for strerror #include +#include #include +#include #include #include -#include "DataBase/Options.h" -#include "DataBase/DB.h" using namespace std; @@ -29,116 +29,124 @@ DB db; -static const char *VERSION_MESSAGE = -PROGRAM " (ABySS) " VERSION "\n" -"Written by Shaun Jackman and Tony Raymond.\n" -"\n" -"Copyright 2014 Canada's Michael Smith Genome Sciences Centre\n"; - -static const char *USAGE_MESSAGE = -"Usage: " PROGRAM " -k [OPTION]... ADJ PATH\n" -"Find paths that overlap. Either output the graph of overlapping\n" -"paths, assemble overlapping paths into larger paths, or trim the\n" -"overlapping paths.\n" -"\n" -" Arguments:\n" -"\n" -" ADJ contig adjacency graph\n" -" PATH sequences of contig IDs\n" -"\n" -" Options:\n" -"\n" -" -k, --kmer=N k-mer size\n" -" -g, --graph=FILE write the contig adjacency graph to FILE\n" -" -r, --repeats=FILE write repeat contigs to FILE\n" -" --overlap find overlapping paths [default]\n" -" --assemble assemble overlapping paths\n" -" --trim trim overlapping paths\n" -" --adj output the graph in ADJ format [default]\n" -" --asqg output the graph in ASQG format\n" -" --dot output the graph in GraphViz format\n" -" --gfa output the graph in GFA1 format\n" -" --gfa1 output the graph in GFA1 format\n" -" --gfa2 output the graph in GFA2 format\n" -" --gv output the graph in GraphViz format\n" -" --sam output the graph in SAM format\n" -" --SS expect contigs to be oriented correctly\n" -" --no-SS no assumption about contig orientation [default]\n" -" -v, --verbose display verbose output\n" -" --help display this help and exit\n" -" --version output version information and exit\n" -" --db=FILE specify path of database repository in FILE\n" -" --library=NAME specify library NAME for sqlite\n" -" --strain=NAME specify strain NAME for sqlite\n" -" --species=NAME specify species NAME for sqlite\n" -"\n" -"Report bugs to <" PACKAGE_BUGREPORT ">.\n"; +static const char* VERSION_MESSAGE = + PROGRAM " (ABySS) " VERSION "\n" + "Written by Shaun Jackman and Tony Raymond.\n" + "\n" + "Copyright 2014 Canada's Michael Smith Genome Sciences Centre\n"; + +static const char* USAGE_MESSAGE = + "Usage: " PROGRAM " -k [OPTION]... ADJ PATH\n" + "Find paths that overlap. Either output the graph of overlapping\n" + "paths, assemble overlapping paths into larger paths, or trim the\n" + "overlapping paths.\n" + "\n" + " Arguments:\n" + "\n" + " ADJ contig adjacency graph\n" + " PATH sequences of contig IDs\n" + "\n" + " Options:\n" + "\n" + " -k, --kmer=N k-mer size\n" + " -g, --graph=FILE write the contig adjacency graph to FILE\n" + " -r, --repeats=FILE write repeat contigs to FILE\n" + " --overlap find overlapping paths [default]\n" + " --assemble assemble overlapping paths\n" + " --trim trim overlapping paths\n" + " --adj output the graph in ADJ format [default]\n" + " --asqg output the graph in ASQG format\n" + " --dot output the graph in GraphViz format\n" + " --gfa output the graph in GFA1 format\n" + " --gfa1 output the graph in GFA1 format\n" + " --gfa2 output the graph in GFA2 format\n" + " --gv output the graph in GraphViz format\n" + " --sam output the graph in SAM format\n" + " --SS expect contigs to be oriented correctly\n" + " --no-SS no assumption about contig orientation [default]\n" + " -v, --verbose display verbose output\n" + " --help display this help and exit\n" + " --version output version information and exit\n" + " --db=FILE specify path of database repository in FILE\n" + " --library=NAME specify library NAME for sqlite\n" + " --strain=NAME specify strain NAME for sqlite\n" + " --species=NAME specify species NAME for sqlite\n" + "\n" + "Report bugs to <" PACKAGE_BUGREPORT ">.\n"; namespace opt { - string db; - dbVars metaVars; - unsigned k; - - /** Output format. */ - int format; // used by ContigProperties - - /** Write the contig adjacency graph to this file. */ - static string graphPath; - - /** Output the IDs of contigs in overlaps to this file. */ - static string repeatContigs; - - /** Run a strand-specific RNA-Seq assembly. */ - static int ss; - - /** Mode of operation. */ - enum { - /** Find overlapping paths, do not assemble. */ - OVERLAP, - /** Assemble overlapping paths. */ - ASSEMBLE, - /** Trim overlapping paths. */ - TRIM, - }; - static int mode; +string db; +dbVars metaVars; +unsigned k; + +/** Output format. */ +int format; // used by ContigProperties + +/** Write the contig adjacency graph to this file. */ +static string graphPath; + +/** Output the IDs of contigs in overlaps to this file. */ +static string repeatContigs; + +/** Run a strand-specific RNA-Seq assembly. */ +static int ss; + +/** Mode of operation. */ +enum +{ + /** Find overlapping paths, do not assemble. */ + OVERLAP, + /** Assemble overlapping paths. */ + ASSEMBLE, + /** Trim overlapping paths. */ + TRIM, +}; +static int mode; - static int verbose; +static int verbose; } static const char* shortopts = "g:k:r:v"; -enum { OPT_HELP = 1, OPT_VERSION, OPT_DB, OPT_LIBRARY, OPT_STRAIN, OPT_SPECIES }; -//enum { OPT_HELP = 1, OPT_VERSION }; - -static const struct option longopts[] = { - { "graph", required_argument, NULL, 'g' }, - { "kmer", required_argument, NULL, 'k' }, - { "assemble", no_argument, &opt::mode, opt::ASSEMBLE }, - { "overlap", no_argument, &opt::mode, opt::OVERLAP }, - { "trim", no_argument, &opt::mode, opt::TRIM }, - { "adj", no_argument, &opt::format, ADJ }, - { "asqg", no_argument, &opt::format, ASQG }, - { "dot", no_argument, &opt::format, DOT }, - { "gfa", no_argument, &opt::format, GFA1 }, - { "gfa1", no_argument, &opt::format, GFA1 }, - { "gfa2", no_argument, &opt::format, GFA2 }, - { "gv", no_argument, &opt::format, DOT }, - { "sam", no_argument, &opt::format, SAM }, - { "SS", no_argument, &opt::ss, 1 }, - { "no-SS", no_argument, &opt::ss, 0 }, - { "repeats", required_argument, NULL, 'r' }, - { "verbose", no_argument, NULL, 'v' }, - { "help", no_argument, NULL, OPT_HELP }, - { "version", no_argument, NULL, OPT_VERSION }, - { "db", required_argument, NULL, OPT_DB }, - { "library", required_argument, NULL, OPT_LIBRARY }, - { "strain", required_argument, NULL, OPT_STRAIN }, - { "species", required_argument, NULL, OPT_SPECIES }, - { NULL, 0, NULL, 0 } +enum +{ + OPT_HELP = 1, + OPT_VERSION, + OPT_DB, + OPT_LIBRARY, + OPT_STRAIN, + OPT_SPECIES }; +// enum { OPT_HELP = 1, OPT_VERSION }; + +static const struct option longopts[] = { { "graph", required_argument, NULL, 'g' }, + { "kmer", required_argument, NULL, 'k' }, + { "assemble", no_argument, &opt::mode, opt::ASSEMBLE }, + { "overlap", no_argument, &opt::mode, opt::OVERLAP }, + { "trim", no_argument, &opt::mode, opt::TRIM }, + { "adj", no_argument, &opt::format, ADJ }, + { "asqg", no_argument, &opt::format, ASQG }, + { "dot", no_argument, &opt::format, DOT }, + { "gfa", no_argument, &opt::format, GFA1 }, + { "gfa1", no_argument, &opt::format, GFA1 }, + { "gfa2", no_argument, &opt::format, GFA2 }, + { "gv", no_argument, &opt::format, DOT }, + { "sam", no_argument, &opt::format, SAM }, + { "SS", no_argument, &opt::ss, 1 }, + { "no-SS", no_argument, &opt::ss, 0 }, + { "repeats", required_argument, NULL, 'r' }, + { "verbose", no_argument, NULL, 'v' }, + { "help", no_argument, NULL, OPT_HELP }, + { "version", no_argument, NULL, OPT_VERSION }, + { "db", required_argument, NULL, OPT_DB }, + { "library", required_argument, NULL, OPT_LIBRARY }, + { "strain", required_argument, NULL, OPT_STRAIN }, + { "species", required_argument, NULL, OPT_SPECIES }, + { NULL, 0, NULL, 0 } }; /** A vertex of the overlap graph. */ -struct Vertex { +struct Vertex +{ unsigned id; bool sense; @@ -146,23 +154,20 @@ static unsigned s_offset; Vertex(unsigned id, bool sense) - : id(id), sense(sense) { } + : id(id) + , sense(sense) + {} - bool operator ==(const Vertex& v) const - { - return id == v.id && sense == v.sense; - } + bool operator==(const Vertex& v) const { return id == v.id && sense == v.sense; } - ContigNode descriptor() const - { - return ContigNode(s_offset + id, sense); - } + ContigNode descriptor() const { return ContigNode(s_offset + id, sense); } }; unsigned Vertex::s_offset; /** An alignment of two overlapping contigs. */ -struct Overlap { +struct Overlap +{ Vertex source; Vertex target; @@ -172,10 +177,12 @@ /** Overlap measured in bp. */ int distance; - Overlap(const Vertex& source, const Vertex& target, - unsigned overlap, int distance) - : source(source), target(target), - overlap(overlap), distance(distance) { } + Overlap(const Vertex& source, const Vertex& target, unsigned overlap, int distance) + : source(source) + , target(target) + , overlap(overlap) + , distance(distance) + {} }; /** The contig IDs that have been removed from paths. */ @@ -188,13 +195,15 @@ typedef vector Paths; /** Return whether this vertex is a path or a contig. */ -static bool isPath(const ContigNode& u) +static bool +isPath(const ContigNode& u) { return u.id() >= Vertex::s_offset; } /** Return a path, complemented if necessary. */ -static ContigPath getPath(const Paths& paths, const ContigNode& u) +static ContigPath +getPath(const Paths& paths, const ContigNode& u) { if (isPath(u)) { unsigned i = u.id() - Vertex::s_offset; @@ -209,8 +218,8 @@ * @param[out] pathIDs the path IDs * @return the paths */ -static Paths readPaths(Graph& g, - const string& inPath, vector& pathIDs) +static Paths +readPaths(Graph& g, const string& inPath, vector& pathIDs) { typedef graph_traits::vertex_descriptor V; @@ -245,42 +254,45 @@ /** Index the first and last contig of each path to facilitate finding * overlaps between paths. */ -static SeedMap makeSeedMap(const Paths& paths) +static SeedMap +makeSeedMap(const Paths& paths) { SeedMap seedMap; - for (Paths::const_iterator it = paths.begin(); - it != paths.end(); ++it) { + for (Paths::const_iterator it = paths.begin(); it != paths.end(); ++it) { if (it->empty()) continue; assert(!it->front().ambiguous()); - seedMap.insert(make_pair(it->front(), - Vertex(it - paths.begin(), false))); + seedMap.insert(make_pair(it->front(), Vertex(it - paths.begin(), false))); assert(!it->back().ambiguous()); - seedMap.insert(make_pair(it->back() ^ 1, - Vertex(it - paths.begin(), true))); + seedMap.insert(make_pair(it->back() ^ 1, Vertex(it - paths.begin(), true))); } return seedMap; } /** Check whether path starts with the sequence [first, last). */ -static bool startsWith(ContigPath path, bool rc, - ContigPath::const_iterator first, - ContigPath::const_iterator last) +static bool +startsWith( + ContigPath path, + bool rc, + ContigPath::const_iterator first, + ContigPath::const_iterator last) { if (rc) reverseComplement(path.begin(), path.end()); assert(*first == path.front()); assert(first < last); - return unsigned(last - first) > path.size() ? false - : equal(first, last, path.begin()); + return unsigned(last - first) > path.size() ? false : equal(first, last, path.begin()); } /** Check whether path starts with the sequence [first, last). */ -static unsigned findOverlap(const Graph& g, - const Paths& paths, - ContigPath::const_iterator first, - ContigPath::const_iterator last, - const Vertex& v, int &distance) +static unsigned +findOverlap( + const Graph& g, + const Paths& paths, + ContigPath::const_iterator first, + ContigPath::const_iterator last, + const Vertex& v, + int& distance) { if (!startsWith(paths[v.id], v.sense, first, last)) return 0; @@ -291,9 +303,13 @@ typedef vector Overlaps; /** Find every path that overlaps with the specified path. */ -static void findOverlaps(const Graph& g, - const Paths& paths, const SeedMap& seedMap, - const Vertex& v, Overlaps& overlaps) +static void +findOverlaps( + const Graph& g, + const Paths& paths, + const SeedMap& seedMap, + const Vertex& v, + Overlaps& overlaps) { ContigPath rc; if (v.sense) { @@ -302,36 +318,30 @@ } const ContigPath& path = v.sense ? rc : paths[v.id]; - for (ContigPath::const_iterator it = path.begin(); - it != path.end(); ++it) { + for (ContigPath::const_iterator it = path.begin(); it != path.end(); ++it) { if (it->ambiguous()) continue; - pair - range = seedMap.equal_range(*it); - for (SeedMap::const_iterator seed = range.first; - seed != range.second; ++seed) { + pair range = seedMap.equal_range(*it); + for (SeedMap::const_iterator seed = range.first; seed != range.second; ++seed) { if (v == seed->second) continue; int distance = 0; - unsigned overlap = findOverlap(g, paths, it, path.end(), - seed->second, distance); + unsigned overlap = findOverlap(g, paths, it, path.end(), seed->second, distance); if (overlap > 0) - overlaps.push_back(Overlap(v, seed->second, - overlap, distance)); - + overlaps.push_back(Overlap(v, seed->second, overlap, distance)); } } } /** Find every pair of overlapping paths. */ -static Overlaps findOverlaps(const Graph& g, const Paths& paths) +static Overlaps +findOverlaps(const Graph& g, const Paths& paths) { SeedMap seedMap = makeSeedMap(paths); Overlaps overlaps; - for (Paths::const_iterator it = paths.begin(); - it != paths.end(); ++it) { + for (Paths::const_iterator it = paths.begin(); it != paths.end(); ++it) { unsigned i = it - paths.begin(); findOverlaps(g, paths, seedMap, Vertex(i, false), overlaps); findOverlaps(g, paths, seedMap, Vertex(i, true), overlaps); @@ -340,9 +350,8 @@ } /** Record the trimmed contigs. */ -static void recordTrimmedContigs( - ContigPath::const_iterator first, - ContigPath::const_iterator last) +static void +recordTrimmedContigs(ContigPath::const_iterator first, ContigPath::const_iterator last) { for (ContigPath::const_iterator it = first; it != last; ++it) if (!it->ambiguous()) @@ -350,7 +359,8 @@ } /** Remove ambiguous contigs from the ends of the path. */ -static void removeAmbiguousContigs(ContigPath& path) +static void +removeAmbiguousContigs(ContigPath& path) { if (!path.empty() && path.back().ambiguous()) path.erase(path.end() - 1); @@ -359,8 +369,8 @@ } /** Remove the overlapping portion of the specified contig. */ -static void removeContigs(ContigPath& path, - unsigned first, unsigned last) +static void +removeContigs(ContigPath& path, unsigned first, unsigned last) { assert(first <= path.size()); assert(last <= path.size()); @@ -377,14 +387,14 @@ } /** Find the largest overlap for each contig and remove it. */ -static void trimOverlaps(Paths& paths, const Overlaps& overlaps) +static void +trimOverlaps(Paths& paths, const Overlaps& overlaps) { vector removed[2]; removed[0].resize(paths.size()); removed[1].resize(paths.size()); - for (Overlaps::const_iterator it = overlaps.begin(); - it != overlaps.end(); ++it) { + for (Overlaps::const_iterator it = overlaps.begin(); it != overlaps.end(); ++it) { unsigned& a = removed[!it->source.sense][it->source.id]; unsigned& b = removed[it->target.sense][it->target.id]; a = max(a, it->overlap); @@ -392,41 +402,41 @@ } for (Paths::iterator it = paths.begin(); it != paths.end(); ++it) - removeContigs(*it, removed[0][it - paths.begin()], - it->size() - removed[1][it - paths.begin()]); + removeContigs( + *it, removed[0][it - paths.begin()], it->size() - removed[1][it - paths.begin()]); } /** Trim the ends of paths that overlap another path. */ -static void trimOverlaps(const Graph& g, Paths& paths) +static void +trimOverlaps(const Graph& g, Paths& paths) { - for (Overlaps overlaps = findOverlaps(g, paths); - !overlaps.empty(); - overlaps = findOverlaps(g, paths)) { + for (Overlaps overlaps = findOverlaps(g, paths); !overlaps.empty(); + overlaps = findOverlaps(g, paths)) { cerr << "Found " << overlaps.size() / 2 << " overlaps.\n"; trimOverlaps(paths, overlaps); } } -static inline -ContigProperties get(vertex_bundle_t, const Graph& g, ContigNode u) +static inline ContigProperties +get(vertex_bundle_t, const Graph& g, ContigNode u) { - return u.ambiguous() - ? ContigProperties(u.length() + opt::k - 1, 0) - : g[u]; + return u.ambiguous() ? ContigProperties(u.length() + opt::k - 1, 0) : g[u]; } /** Add the path overlap edges to the specified graph. */ -static void addPathOverlapEdges(Graph& g, - const Paths& paths, const vector& pathIDs, - const Overlaps& overlaps) +static void +addPathOverlapEdges( + Graph& g, + const Paths& paths, + const vector& pathIDs, + const Overlaps& overlaps) { typedef graph_traits::vertex_descriptor V; const bool allowParallelEdge = opt::mode == opt::ASSEMBLE; // Add the path vertices. g_contigNames.unlock(); - for (Paths::const_iterator it = paths.begin(); - it != paths.end(); ++it) { + for (Paths::const_iterator it = paths.begin(); it != paths.end(); ++it) { const ContigPath& path = *it; const string& id = pathIDs[it - paths.begin()]; if (!path.empty()) { @@ -437,21 +447,19 @@ g_contigNames.lock(); // Remove the single-end contigs that are in paths. - for (Paths::const_iterator it = paths.begin(); - it != paths.end(); ++it) - remove_vertex_if(g, it->begin(), it->end(), - not1(std::mem_fun_ref(&ContigNode::ambiguous))); + for (Paths::const_iterator it = paths.begin(); it != paths.end(); ++it) + remove_vertex_if( + g, it->begin(), it->end(), [](const ContigNode& c) { return !c.ambiguous(); }); // Add the path edges. - for (Overlaps::const_iterator it = overlaps.begin(); - it != overlaps.end(); ++it) { + for (Overlaps::const_iterator it = overlaps.begin(); it != overlaps.end(); ++it) { V u = it->source.descriptor(); V v = it->target.descriptor(); if (allowParallelEdge || !edge(u, v, g).second) add_edge(u, v, it->distance, static_cast(g)); else if (opt::verbose > 0) - cerr << "ambiguous overlap: " << get(vertex_name, g, u) - << " -> " << get(vertex_name, g, v) << '\n'; + cerr << "ambiguous overlap: " << get(vertex_name, g, u) << " -> " + << get(vertex_name, g, v) << '\n'; } } @@ -462,14 +470,15 @@ typedef map OverlapMap; /** Return the number of contigs by which the two paths overlap. */ -static unsigned getOverlap(const OverlapMap& pmap, - graph_traits::vertex_descriptor u, - graph_traits::vertex_descriptor v) +static unsigned +getOverlap( + const OverlapMap& pmap, + graph_traits::vertex_descriptor u, + graph_traits::vertex_descriptor v) { if (isPath(u) && isPath(v)) { // Both vertices are paths. - OverlapMap::const_iterator it = pmap.find( - edge_descriptor(u, v)); + OverlapMap::const_iterator it = pmap.find(edge_descriptor(u, v)); return it == pmap.end() ? 0 : it->second; } else { // One of the two vertices is a contig. @@ -478,21 +487,19 @@ } /** Merge a sequence of overlapping paths. */ -static ContigPath mergePaths(const Paths& paths, - const OverlapMap& overlaps, const ContigPath& merge) +static ContigPath +mergePaths(const Paths& paths, const OverlapMap& overlaps, const ContigPath& merge) { assert(!merge.empty()); ContigNode u = merge.front(); ContigPath path(getPath(paths, u)); - for (ContigPath::const_iterator it = merge.begin() + 1; - it != merge.end(); ++it) { + for (ContigPath::const_iterator it = merge.begin() + 1; it != merge.end(); ++it) { ContigNode v = *it; ContigPath vpath(getPath(paths, v)); unsigned overlap = getOverlap(overlaps, u, v); assert(path.size() > overlap); assert(vpath.size() > overlap); - assert(equal(path.end() - overlap, path.end(), - vpath.begin())); + assert(equal(path.end() - overlap, path.end(), vpath.begin())); path.insert(path.end(), vpath.begin() + overlap, vpath.end()); u = v; } @@ -500,18 +507,21 @@ } /** Return true if the edge e is a path overlap. */ -struct IsPathOverlap : unary_function { - IsPathOverlap(const Graph& g, const OverlapMap& pmap, - const IsPositive& pred) - : m_g(g), m_pmap(pmap), m_isPositive(pred) { } +struct IsPathOverlap : unary_function +{ + IsPathOverlap(const Graph& g, const OverlapMap& pmap, const IsPositive& pred) + : m_g(g) + , m_pmap(pmap) + , m_isPositive(pred) + {} bool operator()(edge_descriptor e) const { bool stranded = true; if (opt::ss) stranded = m_isPositive(e); - return stranded && - getOverlap(m_pmap, source(e, m_g), target(e, m_g)); + return stranded && getOverlap(m_pmap, source(e, m_g), target(e, m_g)); } + private: const Graph& m_g; const OverlapMap& m_pmap; @@ -519,8 +529,8 @@ }; /** Assemble overlapping paths. */ -static void assembleOverlappingPaths(Graph& g, - Paths& paths, vector& pathIDs) +static void +assembleOverlappingPaths(Graph& g, Paths& paths, vector& pathIDs) { if (paths.empty()) return; @@ -531,25 +541,19 @@ // Create a property map of path overlaps. OverlapMap overlapMap; - for (Overlaps::const_iterator it = overlaps.begin(); - it != overlaps.end(); ++it) + for (Overlaps::const_iterator it = overlaps.begin(); it != overlaps.end(); ++it) overlapMap.insert(OverlapMap::value_type( - OverlapMap::key_type( - it->source.descriptor(), - it->target.descriptor()), - it->overlap)); + OverlapMap::key_type(it->source.descriptor(), it->target.descriptor()), it->overlap)); // Assemble unambiguously overlapping paths. Paths merges; - assemble_if(g, back_inserter(merges), - IsPathOverlap(g, overlapMap, IsPositive(g))); + assemble_if(g, back_inserter(merges), IsPathOverlap(g, overlapMap, IsPositive(g))); // Merge overlapping paths. g_contigNames.unlock(); assert(!pathIDs.empty()); setNextContigName(pathIDs.back()); - for (Paths::const_iterator it = merges.begin(); - it != merges.end(); ++it) { + for (Paths::const_iterator it = merges.begin(); it != merges.end(); ++it) { string name = createContigName(); if (opt::verbose > 0) cerr << name << '\t' << *it << '\n'; @@ -559,8 +563,7 @@ paths.push_back(mergePaths(paths, overlapMap, *it)); // Remove the merged paths. - for (ContigPath::const_iterator it2 = it->begin(); - it2 != it->end(); ++it2) { + for (ContigPath::const_iterator it2 = it->begin(); it2 != it->end(); ++it2) { if (isPath(*it2)) paths[it2->id() - Vertex::s_offset].clear(); } @@ -568,13 +571,14 @@ g_contigNames.lock(); } -int main(int argc, char** argv) +int +main(int argc, char** argv) { string commandLine; { ostringstream ss; char** last = argv + argc - 1; - copy(argv, last, ostream_iterator(ss, " ")); + copy(argv, last, ostream_iterator(ss, " ")); ss << *last; commandLine = ss.str(); } @@ -583,33 +587,45 @@ opt::metaVars.resize(3); bool die = false; - for (int c; (c = getopt_long(argc, argv, - shortopts, longopts, NULL)) != -1;) { + for (int c; (c = getopt_long(argc, argv, shortopts, longopts, NULL)) != -1;) { istringstream arg(optarg != NULL ? optarg : ""); switch (c) { - case '?': die = true; break; - case 'g': arg >> opt::graphPath; break; - case 'k': arg >> opt::k; break; - case 'r': arg >> opt::repeatContigs; break; - case 'v': opt::verbose++; break; - case OPT_HELP: - cout << USAGE_MESSAGE; - exit(EXIT_SUCCESS); - case OPT_VERSION: - cout << VERSION_MESSAGE; - exit(EXIT_SUCCESS); - case OPT_DB: - arg >> opt::db; break; - case OPT_LIBRARY: - arg >> opt::metaVars[0]; break; - case OPT_STRAIN: - arg >> opt::metaVars[1]; break; - case OPT_SPECIES: - arg >> opt::metaVars[2]; break; + case '?': + die = true; + break; + case 'g': + arg >> opt::graphPath; + break; + case 'k': + arg >> opt::k; + break; + case 'r': + arg >> opt::repeatContigs; + break; + case 'v': + opt::verbose++; + break; + case OPT_HELP: + cout << USAGE_MESSAGE; + exit(EXIT_SUCCESS); + case OPT_VERSION: + cout << VERSION_MESSAGE; + exit(EXIT_SUCCESS); + case OPT_DB: + arg >> opt::db; + break; + case OPT_LIBRARY: + arg >> opt::metaVars[0]; + break; + case OPT_STRAIN: + arg >> opt::metaVars[1]; + break; + case OPT_SPECIES: + arg >> opt::metaVars[2]; + break; } if (optarg != NULL && !arg.eof()) { - cerr << PROGRAM ": invalid option: `-" - << (char)c << optarg << "'\n"; + cerr << PROGRAM ": invalid option: `-" << (char)c << optarg << "'\n"; exit(EXIT_FAILURE); } } @@ -628,12 +644,11 @@ } if (die) { - cerr << "Try `" << PROGRAM - << " --help' for more information.\n"; + cerr << "Try `" << PROGRAM << " --help' for more information.\n"; exit(EXIT_FAILURE); } - const char *adjPath = argv[optind++]; + const char* adjPath = argv[optind++]; if (opt::verbose > 0) cerr << "Reading `" << adjPath << "'..." << endl; ifstream fin(adjPath); @@ -647,37 +662,35 @@ Paths paths = readPaths(g, pathsFile, pathIDs); switch (opt::mode) { - case opt::OVERLAP: + case opt::OVERLAP: // Find overlapping paths, do not assemble. - addPathOverlapEdges(g, paths, pathIDs, - findOverlaps(g, paths)); + addPathOverlapEdges(g, paths, pathIDs, findOverlaps(g, paths)); paths.clear(); if (opt::graphPath.empty()) opt::graphPath = "-"; break; - case opt::ASSEMBLE: + case opt::ASSEMBLE: // Assemble overlapping paths. assembleOverlappingPaths(g, paths, pathIDs); break; - case opt::TRIM: + case opt::TRIM: // Trim overlapping paths. trimOverlaps(g, paths); // Remove paths consisting of a single contig. - for_each_if(paths.begin(), paths.end(), - mem_fun_ref(&ContigPath::clear), - compose1( - bind2nd(equal_to(), 1), - mem_fun_ref(&ContigPath::size))); + for_each_if( + paths.begin(), + paths.end(), + [](ContigPath& c) { return c.clear(); }, + [](const ContigPath& c) { return c.size() == 1; }); // Add the paths to the graph. addPathOverlapEdges(g, paths, pathIDs, Overlaps()); break; } // Output the paths. - for (Paths::const_iterator it = paths.begin(); - it != paths.end(); ++it) { + for (Paths::const_iterator it = paths.begin(); it != paths.end(); ++it) { if (it->empty()) continue; assert(it->size() != 1); @@ -688,8 +701,7 @@ // Output the graph. if (!opt::graphPath.empty()) { ofstream fout; - ostream& out = opt::graphPath == "-" ? cout - : (fout.open(opt::graphPath.c_str()), fout); + ostream& out = opt::graphPath == "-" ? cout : (fout.open(opt::graphPath.c_str()), fout); assert_good(out, opt::graphPath); write_graph(out, g, PROGRAM, commandLine); assert_good(out, opt::graphPath); @@ -699,24 +711,18 @@ if (!opt::repeatContigs.empty()) { sort(s_trimmedContigs.begin(), s_trimmedContigs.end()); s_trimmedContigs.erase( - unique(s_trimmedContigs.begin(), - s_trimmedContigs.end()), s_trimmedContigs.end()); + unique(s_trimmedContigs.begin(), s_trimmedContigs.end()), s_trimmedContigs.end()); ofstream out(opt::repeatContigs.c_str()); assert_good(out, opt::repeatContigs); - for (vector::const_iterator it - = s_trimmedContigs.begin(); - it != s_trimmedContigs.end(); ++it) + for (vector::const_iterator it = s_trimmedContigs.begin(); + it != s_trimmedContigs.end(); + ++it) out << get(g_contigNames, *it) << '\n'; assert_good(out, opt::repeatContigs); } if (!opt::db.empty()) { - init(db, - opt::db, - opt::verbose, - PROGRAM, - opt::getCommand(argc, argv), - opt::metaVars); + init(db, opt::db, opt::verbose, PROGRAM, opt::getCommand(argc, argv), opt::metaVars); addToDb(db, "SS", opt::ss); addToDb(db, "K", opt::k); } diff -Nru abyss-2.2.3/PopBubbles/PopBubbles.cpp abyss-2.2.4/PopBubbles/PopBubbles.cpp --- abyss-2.2.3/PopBubbles/PopBubbles.cpp 2019-09-27 20:02:10.000000000 +0000 +++ abyss-2.2.4/PopBubbles/PopBubbles.cpp 2020-01-30 18:48:44.000000000 +0000 @@ -3,40 +3,40 @@ * Written by Shaun Jackman . */ -#include "config.h" +#include "Graph/PopBubbles.h" #include "Common/Options.h" #include "ConstString.h" #include "ContigPath.h" #include "ContigProperties.h" #include "FastaReader.h" -#include "IOUtil.h" -#include "Sequence.h" -#include "Uncompress.h" -#include "alignGlobal.h" #include "Graph/ContigGraph.h" #include "Graph/ContigGraphAlgorithms.h" #include "Graph/DepthFirstSearch.h" #include "Graph/DirectedGraph.h" #include "Graph/GraphIO.h" #include "Graph/GraphUtil.h" -#include "Graph/PopBubbles.h" +#include "IOUtil.h" +#include "Sequence.h" +#include "Uncompress.h" +#include "alignGlobal.h" +#include "config.h" +#include #include #include -#include #include // for UINT_MAX #include #include #include -#include #include #include +#include #include #include #include #include #include #if _OPENMP -# include +#include #endif using namespace std; @@ -46,126 +46,133 @@ #define PROGRAM "PopBubbles" static const char VERSION_MESSAGE[] = -PROGRAM " (" PACKAGE_NAME ") " VERSION "\n" -"Written by Shaun Jackman.\n" -"\n" -"Copyright 2014 Canada's Michael Smith Genome Sciences Centre\n"; + PROGRAM " (" PACKAGE_NAME ") " VERSION "\n" + "Written by Shaun Jackman.\n" + "\n" + "Copyright 2014 Canada's Michael Smith Genome Sciences Centre\n"; static const char USAGE_MESSAGE[] = -"Usage: " PROGRAM " -k [OPTION]... FASTA ADJ\n" -"Identify and pop simple bubbles.\n" -"\n" -" Arguments:\n" -"\n" -" FASTA contigs in FASTA format\n" -" ADJ contig adjacency graph\n" -"\n" -" Options:\n" -"\n" -" -k, --kmer=N k-mer size\n" -" -a, --branches=N maximum number of branches, default: 2\n" -" -b, --bubble-length=N pop bubbles shorter than N bp\n" -" default is 10000\n" -" -p, --identity=REAL minimum identity, default: 0.9\n" -" -c, --coverage=REAL remove contigs with mean k-mer coverage\n" -" less than this threshold [0]\n" -" --scaffold scaffold over bubbles that have\n" -" insufficient identity\n" -" --no-scaffold disable scaffolding [default]\n" -" --SS expect contigs to be oriented correctly\n" -" --no-SS no assumption about contig orientation [default]\n" -" -g, --graph=FILE write the contig adjacency graph to FILE\n" -" --adj output the graph in ADJ format [default]\n" -" --asqg output the graph in ASQG format\n" -" --dot output the graph in GraphViz format\n" -" --gfa output the graph in GFA1 format\n" -" --gfa1 output the graph in GFA1 format\n" -" --gfa2 output the graph in GFA2 format\n" -" --gv output the graph in GraphViz format\n" -" --sam output the graph in SAM format\n" -" --bubble-graph output a graph of the bubbles\n" -" -j, --threads=N use N parallel threads [1]\n" -" -v, --verbose display verbose output\n" -" --help display this help and exit\n" -" --version output version information and exit\n" -"\n" -"Report bugs to <" PACKAGE_BUGREPORT ">.\n"; + "Usage: " PROGRAM " -k [OPTION]... FASTA ADJ\n" + "Identify and pop simple bubbles.\n" + "\n" + " Arguments:\n" + "\n" + " FASTA contigs in FASTA format\n" + " ADJ contig adjacency graph\n" + "\n" + " Options:\n" + "\n" + " -k, --kmer=N k-mer size\n" + " -a, --branches=N maximum number of branches, default: 2\n" + " -b, --bubble-length=N pop bubbles shorter than N bp\n" + " default is 10000\n" + " -p, --identity=REAL minimum identity, default: 0.9\n" + " -c, --coverage=REAL remove contigs with mean k-mer coverage\n" + " less than this threshold [0]\n" + " --scaffold scaffold over bubbles that have\n" + " insufficient identity\n" + " --no-scaffold disable scaffolding [default]\n" + " --SS expect contigs to be oriented correctly\n" + " --no-SS no assumption about contig orientation [default]\n" + " -g, --graph=FILE write the contig adjacency graph to FILE\n" + " --adj output the graph in ADJ format [default]\n" + " --asqg output the graph in ASQG format\n" + " --dot output the graph in GraphViz format\n" + " --gfa output the graph in GFA1 format\n" + " --gfa1 output the graph in GFA1 format\n" + " --gfa2 output the graph in GFA2 format\n" + " --gv output the graph in GraphViz format\n" + " --sam output the graph in SAM format\n" + " --bubble-graph output a graph of the bubbles\n" + " -j, --threads=N use N parallel threads [1]\n" + " -v, --verbose display verbose output\n" + " --help display this help and exit\n" + " --version output version information and exit\n" + "\n" + "Report bugs to <" PACKAGE_BUGREPORT ">.\n"; namespace opt { - unsigned k; // used by ContigProperties +unsigned k; // used by ContigProperties - /** Maximum number of branches. */ - static unsigned maxBranches = 2; +/** Maximum number of branches. */ +static unsigned maxBranches = 2; - /** Pop bubbles shorter than this threshold. */ - static unsigned maxLength = 10000; +/** Pop bubbles shorter than this threshold. */ +static unsigned maxLength = 10000; - /** Minimum identity. */ - static float identity = 0.9; +/** Minimum identity. */ +static float identity = 0.9; - /** Minimum mean k-mer coverage. */ - static float minCoverage; +/** Minimum mean k-mer coverage. */ +static float minCoverage; - /** Scaffold over bubbles that have insufficient identity. */ - static int scaffold; +/** Scaffold over bubbles that have insufficient identity. */ +static int scaffold; - /** Write the contig adjacency graph to this file. */ - static string graphPath; +/** Write the contig adjacency graph to this file. */ +static string graphPath; - /** Output a graph of the bubbles. */ - static int bubbleGraph; +/** Output a graph of the bubbles. */ +static int bubbleGraph; - int format; // used by ContigProperties +int format; // used by ContigProperties - /** Run a strand-specific RNA-Seq assembly. */ - static int ss; +/** Run a strand-specific RNA-Seq assembly. */ +static int ss; - /** Number of threads. */ - static int threads = 1; +/** Number of threads. */ +static int threads = 1; } static const char shortopts[] = "a:b:c:g:j:k:p:v"; -enum { OPT_HELP = 1, OPT_VERSION }; - -static const struct option longopts[] = { - { "branches", required_argument, NULL, 'a' }, - { "bubble-length", required_argument, NULL, 'b' }, - { "coverage", required_argument, NULL, 'c' }, - { "bubble-graph", no_argument, &opt::bubbleGraph, 1, }, - { "graph", required_argument, NULL, 'g' }, - { "adj", no_argument, &opt::format, ADJ }, - { "asqg", no_argument, &opt::format, ASQG }, - { "dot", no_argument, &opt::format, DOT }, - { "gfa", no_argument, &opt::format, GFA1 }, - { "gfa1", no_argument, &opt::format, GFA1 }, - { "gfa2", no_argument, &opt::format, GFA2 }, - { "gv", no_argument, &opt::format, DOT }, - { "sam", no_argument, &opt::format, SAM }, - { "kmer", required_argument, NULL, 'k' }, - { "identity", required_argument, NULL, 'p' }, - { "scaffold", no_argument, &opt::scaffold, 1}, - { "no-scaffold", no_argument, &opt::scaffold, 0}, - { "SS", no_argument, &opt::ss, 1 }, - { "no-SS", no_argument, &opt::ss, 0 }, - { "threads", required_argument, NULL, 'j' }, - { "verbose", no_argument, NULL, 'v' }, - { "help", no_argument, NULL, OPT_HELP }, - { "version", no_argument, NULL, OPT_VERSION }, - { NULL, 0, NULL, 0 } +enum +{ + OPT_HELP = 1, + OPT_VERSION }; +static const struct option longopts[] = { { "branches", required_argument, NULL, 'a' }, + { "bubble-length", required_argument, NULL, 'b' }, + { "coverage", required_argument, NULL, 'c' }, + { + "bubble-graph", + no_argument, + &opt::bubbleGraph, + 1, + }, + { "graph", required_argument, NULL, 'g' }, + { "adj", no_argument, &opt::format, ADJ }, + { "asqg", no_argument, &opt::format, ASQG }, + { "dot", no_argument, &opt::format, DOT }, + { "gfa", no_argument, &opt::format, GFA1 }, + { "gfa1", no_argument, &opt::format, GFA1 }, + { "gfa2", no_argument, &opt::format, GFA2 }, + { "gv", no_argument, &opt::format, DOT }, + { "sam", no_argument, &opt::format, SAM }, + { "kmer", required_argument, NULL, 'k' }, + { "identity", required_argument, NULL, 'p' }, + { "scaffold", no_argument, &opt::scaffold, 1 }, + { "no-scaffold", no_argument, &opt::scaffold, 0 }, + { "SS", no_argument, &opt::ss, 1 }, + { "no-SS", no_argument, &opt::ss, 0 }, + { "threads", required_argument, NULL, 'j' }, + { "verbose", no_argument, NULL, 'v' }, + { "help", no_argument, NULL, OPT_HELP }, + { "version", no_argument, NULL, OPT_VERSION }, + { NULL, 0, NULL, 0 } }; + /** Popped branches. */ static vector g_popped; /** Contig adjacency graph. */ -typedef ContigGraph > Graph; +typedef ContigGraph> Graph; typedef Graph::vertex_descriptor vertex_descriptor; typedef Graph::adjacency_iterator adjacency_iterator; /** Return the distance from vertex u to v. */ -static int getDistance(const Graph& g, - vertex_descriptor u, vertex_descriptor v) +static int +getDistance(const Graph& g, vertex_descriptor u, vertex_descriptor v) { typedef graph_traits::edge_descriptor edge_descriptor; pair e = edge(u, v, g); @@ -173,9 +180,12 @@ return g[e.first].distance; } -struct CompareCoverage { +struct CompareCoverage +{ const Graph& g; - CompareCoverage(const Graph& g) : g(g) { } + CompareCoverage(const Graph& g) + : g(g) + {} bool operator()(vertex_descriptor u, vertex_descriptor v) { return g[u].coverage > g[v].coverage; @@ -183,33 +193,33 @@ }; /** Pop the bubble between vertices v and tail. */ -static void popBubble(Graph& g, - vertex_descriptor v, vertex_descriptor tail) +static void +popBubble(Graph& g, vertex_descriptor v, vertex_descriptor tail) { unsigned nbranches = g.out_degree(v); assert(nbranches > 1); assert(nbranches == g.in_degree(tail)); vector sorted(nbranches); - pair - adj = g.adjacent_vertices(v); + pair adj = g.adjacent_vertices(v); copy(adj.first, adj.second, sorted.begin()); sort(sorted.begin(), sorted.end(), CompareCoverage(g)); if (opt::bubbleGraph) #pragma omp critical(cout) { cout << '"' << get(vertex_name, g, v) << "\" -> {"; - for (vector::const_iterator - it = sorted.begin(); it != sorted.end(); ++it) + for (vector::const_iterator it = sorted.begin(); it != sorted.end(); + ++it) cout << " \"" << get(vertex_name, g, *it) << '"'; cout << " } -> \"" << get(vertex_name, g, tail) << "\"\n"; } #pragma omp critical(g_popped) - transform(sorted.begin() + 1, sorted.end(), - back_inserter(g_popped), - mem_fun_ref(&ContigNode::contigIndex)); + transform(sorted.begin() + 1, sorted.end(), back_inserter(g_popped), [](const ContigNode& c) { + return c.contigIndex(); + }); } -static struct { +static struct +{ unsigned bubbles; unsigned popped; unsigned scaffold; @@ -224,7 +234,8 @@ static Contigs g_contigs; /** Return the sequence of vertex u. */ -static string getSequence(const Graph* g, vertex_descriptor u) +static string +getSequence(const Graph* g, vertex_descriptor u) { size_t i = get(vertex_contig_index, *g, u); assert(i < g_contigs.size()); @@ -233,7 +244,8 @@ } /** Return the length of vertex v. */ -static unsigned getLength(const Graph* g, vertex_descriptor v) +static unsigned +getLength(const Graph* g, vertex_descriptor v) { return (*g)[v].length; } @@ -243,38 +255,35 @@ * @param v the vertex to the right of the bubble * @return the identity of the global alignment */ -template -static float getAlignmentIdentity(const Graph& g, - vertex_descriptor t, vertex_descriptor v, - It first, It last) +template +static float +getAlignmentIdentity(const Graph& g, vertex_descriptor t, vertex_descriptor v, It first, It last) { unsigned nbranches = distance(first, last); vector inDists(nbranches); - transform(first, last, inDists.begin(), - boost::lambda::bind(getDistance, boost::cref(g), t, _1)); + transform( + first, last, inDists.begin(), boost::lambda::bind(getDistance, boost::cref(g), t, _1)); vector outDists(nbranches); - transform(first, last, outDists.begin(), - boost::lambda::bind(getDistance, boost::cref(g), _1, v)); + transform( + first, last, outDists.begin(), boost::lambda::bind(getDistance, boost::cref(g), _1, v)); vector insertLens(nbranches); - transform(first, last, insertLens.begin(), - boost::lambda::bind(getDistance, boost::cref(g), t, _1) - + boost::lambda::bind(getLength, &g, _1) - + boost::lambda::bind(getDistance, boost::cref(g), _1, v)); + transform( + first, + last, + insertLens.begin(), + boost::lambda::bind(getDistance, boost::cref(g), t, _1) + + boost::lambda::bind(getLength, &g, _1) + + boost::lambda::bind(getDistance, boost::cref(g), _1, v)); - int max_in_overlap = -(*min_element(inDists.begin(), - inDists.end())); + int max_in_overlap = -(*min_element(inDists.begin(), inDists.end())); assert(max_in_overlap >= 0); - int max_out_overlap = -(*min_element(outDists.begin(), - outDists.end())); + int max_out_overlap = -(*min_element(outDists.begin(), outDists.end())); assert(max_out_overlap >= 0); - int min_insert_len = *min_element(insertLens.begin(), - insertLens.end()); - int max_insert_len = *max_element(insertLens.begin(), - insertLens.end()); - - float max_identity = - (float)(min_insert_len + max_in_overlap + max_out_overlap) / - (max_insert_len + max_in_overlap + max_out_overlap); + int min_insert_len = *min_element(insertLens.begin(), insertLens.end()); + int max_insert_len = *max_element(insertLens.begin(), insertLens.end()); + + float max_identity = (float)(min_insert_len + max_in_overlap + max_out_overlap) / + (max_insert_len + max_in_overlap + max_out_overlap); if (min_insert_len <= 0 || max_identity < opt::identity) return max_identity; @@ -291,13 +300,14 @@ unsigned matches, consensusSize; tie(matches, consensusSize) = align(seqs); return (float)(matches + max_in_overlap + max_out_overlap) / - (consensusSize + max_in_overlap + max_out_overlap); + (consensusSize + max_in_overlap + max_out_overlap); } /** Pop the specified bubble if it is a simple bubble. * @return whether the bubble is popped */ -static bool popSimpleBubble(Graph* pg, vertex_descriptor v) +static bool +popSimpleBubble(Graph* pg, vertex_descriptor v) { Graph& g = *pg; unsigned nbranches = g.out_degree(v); @@ -310,15 +320,14 @@ } vertex_descriptor tail = *g.adjacent_vertices(v1).first; if (v == get(vertex_complement, g, tail) // Palindrome - || g.in_degree(tail) != nbranches) { + || g.in_degree(tail) != nbranches) { #pragma omp atomic g_count.notSimple++; return false; } // Check that every branch is simple and ends at the same node. - pair - adj = g.adjacent_vertices(v); + pair adj = g.adjacent_vertices(v); for (adjacency_iterator it = adj.first; it != adj.second; ++it) { if (g.out_degree(*it) != 1 || g.in_degree(*it) != 1) { #pragma omp atomic @@ -337,8 +346,7 @@ #pragma omp critical(cerr) { cerr << "\n* " << get(vertex_name, g, v) << " ->"; - for (adjacency_iterator it = adj.first; - it != adj.second; ++it) + for (adjacency_iterator it = adj.first; it != adj.second; ++it) cerr << ' ' << get(vertex_name, g, *it); cerr << " -> " << get(vertex_name, g, tail) << '\n'; } @@ -354,8 +362,9 @@ } vector lengths(nbranches); - transform(adj.first, adj.second, lengths.begin(), - bind1st(ptr_fun(getLength), &g)); + transform(adj.first, adj.second, lengths.begin(), [&g](const ContigNode& c) { + return getLength(&g, c); + }); unsigned minLength = *min_element(lengths.begin(), lengths.end()); unsigned maxLength = *max_element(lengths.begin(), lengths.end()); if (maxLength >= opt::maxLength) { @@ -364,18 +373,17 @@ g_count.tooLong++; if (opt::verbose > 1) #pragma omp critical(cerr) - cerr << minLength << '\t' << maxLength - << "\t0\t(too long)\n"; + cerr << minLength << '\t' << maxLength << "\t0\t(too long)\n"; return false; } - float identity = opt::identity == 0 ? 0 - : getAlignmentIdentity(g, v, tail, adj.first, adj.second); + float identity = + opt::identity == 0 ? 0 : getAlignmentIdentity(g, v, tail, adj.first, adj.second); bool dissimilar = identity < opt::identity; if (opt::verbose > 1) #pragma omp critical(cerr) cerr << minLength << '\t' << maxLength << '\t' << identity - << (dissimilar ? "\t(dissimilar)" : "") << '\n'; + << (dissimilar ? "\t(dissimilar)" : "") << '\n'; if (dissimilar) { // Insufficient identity. #pragma omp atomic @@ -390,14 +398,14 @@ } /** Add distances to a path. */ -static ContigPath addDistance(const Graph& g, const ContigPath& path) +static ContigPath +addDistance(const Graph& g, const ContigPath& path) { ContigPath out; out.reserve(path.size()); ContigNode u = path.front(); out.push_back(u); - for (ContigPath::const_iterator it = path.begin() + 1; - it != path.end(); ++it) { + for (ContigPath::const_iterator it = path.begin() + 1; it != path.end(); ++it) { ContigNode v = *it; int distance = getDistance(g, u, v); if (distance >= 0) { @@ -413,7 +421,8 @@ } /** Return the length of the longest path through the bubble. */ -static int longestPath(const Graph& g, const Bubble& topo) +static int +longestPath(const Graph& g, const Bubble& topo) { typedef graph_traits::edge_descriptor E; typedef graph_traits::out_edge_iterator Eit; @@ -422,8 +431,7 @@ EdgeWeightMap weight(g); map distance; distance[topo.front()] = 0; - for (Bubble::const_iterator it = topo.begin(); - it != topo.end(); ++it) { + for (Bubble::const_iterator it = topo.begin(); it != topo.end(); ++it) { V u = *it; Eit eit, elast; for (tie(eit, elast) = out_edges(u, g); eit != elast; ++eit) { @@ -440,7 +448,8 @@ * Add an edge (u,w) with the distance property set to the length of * the largest branch of the bubble. */ -static void scaffoldBubble(Graph& g, const Bubble& bubble) +static void +scaffoldBubble(Graph& g, const Bubble& bubble) { typedef graph_traits::vertex_descriptor V; assert(opt::scaffold); @@ -456,15 +465,15 @@ assert(bubble.size() > 2); size_t n = bubble.size() - 2; g_popped.reserve(g_popped.size() + n); - for (Bubble::const_iterator it = bubble.begin() + 1; - it != bubble.end() - 1; ++it) + for (Bubble::const_iterator it = bubble.begin() + 1; it != bubble.end() - 1; ++it) g_popped.push_back(it->contigIndex()); add_edge(u, w, max(longestPath(g, bubble), 1), g); } /** Pop the specified bubble if it is simple, otherwise scaffold. */ -static void popOrScaffoldBubble(Graph& g, const Bubble& bubble) +static void +popOrScaffoldBubble(Graph& g, const Bubble& bubble) { #pragma omp atomic g_count.bubbles++; @@ -476,20 +485,23 @@ } /** Return the length of the specified vertex in k-mer. */ -static unsigned getKmerLength(const ContigProperties& vp) +static unsigned +getKmerLength(const ContigProperties& vp) { assert(vp.length >= opt::k); return vp.length - opt::k + 1; } /** Return the mean k-mer coverage of the specified vertex. */ -static float getMeanCoverage(const ContigProperties& vp) +static float +getMeanCoverage(const ContigProperties& vp) { return (float)vp.coverage / getKmerLength(vp); } /** Remove contigs with insufficient coverage. */ -static void filterGraph(Graph& g) +static void +filterGraph(Graph& g) { typedef graph_traits GTraits; typedef GTraits::vertex_descriptor V; @@ -511,62 +523,82 @@ } } if (opt::verbose > 0) { - cerr << "Removed " << removedKmer << " k-mer in " - << removedContigs << " contigs with mean k-mer coverage " - "less than " << opt::minCoverage << ".\n"; + cerr << "Removed " << removedKmer << " k-mer in " << removedContigs + << " contigs with mean k-mer coverage " + "less than " + << opt::minCoverage << ".\n"; printGraphStats(cerr, g); } } /** Remove the specified contig from the adjacency graph. */ -static void removeContig(Graph* g, ContigID id) +static void +removeContig(Graph* g, ContigID id) { ContigNode v(id, false); g->clear_vertex(v); g->remove_vertex(v); } -int main(int argc, char** argv) +int +main(int argc, char** argv) { string commandLine; { ostringstream ss; char** last = argv + argc - 1; - copy(argv, last, ostream_iterator(ss, " ")); + copy(argv, last, ostream_iterator(ss, " ")); ss << *last; commandLine = ss.str(); } bool die = false; - for (int c; (c = getopt_long(argc, argv, - shortopts, longopts, NULL)) != -1;) { + for (int c; (c = getopt_long(argc, argv, shortopts, longopts, NULL)) != -1;) { istringstream arg(optarg != NULL ? optarg : ""); switch (c) { - case '?': die = true; break; - case 'a': arg >> opt::maxBranches; break; - case 'b': arg >> opt::maxLength; break; - case 'c': arg >> opt::minCoverage; break; - case 'g': arg >> opt::graphPath; break; - case 'j': arg >> opt::threads; break; - case 'k': arg >> opt::k; break; - case 'p': arg >> opt::identity; break; - case 'v': opt::verbose++; break; - case OPT_HELP: - cout << USAGE_MESSAGE; - exit(EXIT_SUCCESS); - case OPT_VERSION: - cout << VERSION_MESSAGE; - exit(EXIT_SUCCESS); + case '?': + die = true; + break; + case 'a': + arg >> opt::maxBranches; + break; + case 'b': + arg >> opt::maxLength; + break; + case 'c': + arg >> opt::minCoverage; + break; + case 'g': + arg >> opt::graphPath; + break; + case 'j': + arg >> opt::threads; + break; + case 'k': + arg >> opt::k; + break; + case 'p': + arg >> opt::identity; + break; + case 'v': + opt::verbose++; + break; + case OPT_HELP: + cout << USAGE_MESSAGE; + exit(EXIT_SUCCESS); + case OPT_VERSION: + cout << VERSION_MESSAGE; + exit(EXIT_SUCCESS); } if (optarg != NULL && !arg.eof()) { - cerr << PROGRAM ": invalid option: `-" - << (char)c << optarg << "'\n"; + cerr << PROGRAM ": invalid option: `-" << (char)c << optarg << "'\n"; exit(EXIT_FAILURE); } } if (opt::k <= 0) { - cerr << PROGRAM ": " << "missing -k,--kmer option\n"; + cerr << PROGRAM ": " + << "missing -k,--kmer option\n"; die = true; } @@ -581,8 +613,7 @@ } if (die) { - cerr << "Try `" << PROGRAM - << " --help' for more information.\n"; + cerr << "Try `" << PROGRAM << " --help' for more information.\n"; exit(EXIT_FAILURE); } @@ -626,37 +657,33 @@ cout << "digraph bubbles {\n"; Bubbles bubbles = discoverBubbles(g); - for (Bubbles::const_iterator it = bubbles.begin(); - it != bubbles.end(); ++it) + for (Bubbles::const_iterator it = bubbles.begin(); it != bubbles.end(); ++it) popOrScaffoldBubble(g, *it); // Each bubble should be identified twice. Remove the duplicate. sort(g_popped.begin(), g_popped.end()); - g_popped.erase(unique(g_popped.begin(), g_popped.end()), - g_popped.end()); + g_popped.erase(unique(g_popped.begin(), g_popped.end()), g_popped.end()); if (opt::bubbleGraph) { cout << "}\n"; } else { - for (vector::const_iterator it = g_popped.begin(); - it != g_popped.end(); ++it) + for (vector::const_iterator it = g_popped.begin(); it != g_popped.end(); ++it) cout << get(g_contigNames, *it) << '\n'; } if (opt::verbose > 0) - cerr << "Bubbles: " << (g_count.bubbles + 1) / 2 - << " Popped: " << (g_count.popped + 1) / 2 - << " Scaffolds: " << (g_count.scaffold + 1) / 2 - << " Complex: " << (g_count.notSimple + 1) / 2 - << " Too long: " << (g_count.tooLong + 1) / 2 - << " Too many: " << (g_count.tooMany + 1) / 2 - << " Dissimilar: " << (g_count.dissimilar + 1) / 2 - << '\n'; + cerr << "Bubbles: " << (g_count.bubbles + 1) / 2 << " Popped: " << (g_count.popped + 1) / 2 + << " Scaffolds: " << (g_count.scaffold + 1) / 2 + << " Complex: " << (g_count.notSimple + 1) / 2 + << " Too long: " << (g_count.tooLong + 1) / 2 + << " Too many: " << (g_count.tooMany + 1) / 2 + << " Dissimilar: " << (g_count.dissimilar + 1) / 2 << '\n'; if (!opt::graphPath.empty()) { // Remove the popped contigs from the adjacency graph. - for_each(g_popped.begin(), g_popped.end(), - bind1st(ptr_fun(removeContig), &g)); + for_each(g_popped.begin(), g_popped.end(), [&g](const ContigID& c) { + return removeContig(&g, c); + }); // Assemble unambiguous paths. g_contigNames.unlock(); @@ -669,21 +696,18 @@ assemble_stranded(g, back_inserter(paths)); else assemble(g, back_inserter(paths)); - for (ContigPaths::const_iterator it = paths.begin(); - it != paths.end(); ++it) { + for (ContigPaths::const_iterator it = paths.begin(); it != paths.end(); ++it) { ContigNode u(numContigs + it - paths.begin(), false); string name = createContigName(); put(vertex_name, g, u, name); - cout << name << '\t' - << addDistance(gorig, *it) << '\n'; + cout << name << '\t' << addDistance(gorig, *it) << '\n'; } } else { if (opt::ss) assemble_stranded(g, back_inserter(paths)); else assemble(g, back_inserter(paths)); - for (ContigPaths::const_iterator it = paths.begin(); - it != paths.end(); ++it) { + for (ContigPaths::const_iterator it = paths.begin(); it != paths.end(); ++it) { ContigNode u(numContigs + it - paths.begin(), false); string name = createContigName(); put(vertex_name, g, u, name); diff -Nru abyss-2.2.3/README.md abyss-2.2.4/README.md --- abyss-2.2.3/README.md 2019-09-27 20:02:10.000000000 +0000 +++ abyss-2.2.4/README.md 2020-01-30 18:48:44.000000000 +0000 @@ -1,3 +1,8 @@ +[![Release](https://img.shields.io/github/release/bcgsc/abyss.svg)](https://github.com/bcgsc/abyss/releases) +[![Downloads](https://img.shields.io/github/downloads/bcgsc/abyss/total?logo=github)](https://github.com/bcgsc/abyss/releases/download/2.2.3/abyss-2.2.3.tar.gz) +[![Conda](https://img.shields.io/conda/dn/bioconda/abyss?label=Conda)](https://anaconda.org/bioconda/abyss) +[![Issues](https://img.shields.io/github/issues/bcgsc/abyss.svg)](https://github.com/bcgsc/abyss/issues) + ABySS ===== diff -Nru abyss-2.2.3/Scaffold/scaffold.cc abyss-2.2.4/Scaffold/scaffold.cc --- abyss-2.2.3/Scaffold/scaffold.cc 2019-09-27 20:02:10.000000000 +0000 +++ abyss-2.2.4/Scaffold/scaffold.cc 2020-01-30 18:48:44.000000000 +0000 @@ -1,12 +1,10 @@ -#include "config.h" +#include "Common/UnorderedMap.h" #include "ContigNode.h" #include "ContigPath.h" #include "ContigProperties.h" +#include "DataBase/DB.h" +#include "DataBase/Options.h" #include "Estimate.h" -#include "IOUtil.h" -#include "Iterator.h" -#include "Uncompress.h" -#include "Common/UnorderedMap.h" #include "Graph/Assemble.h" #include "Graph/ContigGraph.h" #include "Graph/ContigGraphAlgorithms.h" @@ -15,6 +13,10 @@ #include "Graph/GraphIO.h" #include "Graph/GraphUtil.h" #include "Graph/PopBubbles.h" +#include "IOUtil.h" +#include "Iterator.h" +#include "Uncompress.h" +#include "config.h" #include #include #include @@ -24,8 +26,6 @@ #include #include #include -#include "DataBase/Options.h" -#include "DataBase/DB.h" using namespace std; using namespace std::rel_ops; @@ -37,130 +37,144 @@ DB db; static const char VERSION_MESSAGE[] = -PROGRAM " (" PACKAGE_NAME ") " VERSION "\n" -"Written by Shaun Jackman.\n" -"\n" -"Copyright 2018 Canada's Michael Smith Genome Sciences Centre\n"; + PROGRAM " (" PACKAGE_NAME ") " VERSION "\n" + "Written by Shaun Jackman.\n" + "\n" + "Copyright 2018 Canada's Michael Smith Genome Sciences Centre\n"; static const char USAGE_MESSAGE[] = -"Usage: " PROGRAM " -k [OPTION]... FASTA|OVERLAP DIST...\n" -"Scaffold contigs using the distance estimate graph.\n" -"\n" -" Arguments:\n" -"\n" -" FASTA contigs in FASTA format\n" -" OVERLAP the contig overlap graph\n" -" DIST estimates of the distance between contigs\n" -"\n" -" Options:\n" -"\n" -" -n, --npairs=N minimum number of pairs [0]\n" -" or -n A-B:S Find the value of n in [A,B] with step size S\n" -" that maximizes the scaffold N50.\n" -" Default value for the step size is 1, if unspecified.\n" -" -s, --seed-length=N minimum contig length [1000]\n" -" or -s A-B Find the value of s in [A,B]\n" -" that maximizes the scaffold N50.\n" -" --grid optimize using a grid search [default]\n" -" --line optimize using a line search\n" -" -k, --kmer=N length of a k-mer\n" -" -G, --genome-size=N expected genome size. Used to calculate NG50\n" -" and associated stats [disabled]\n" -" --min-gap=N minimum scaffold gap length to output [50]\n" -" --max-gap=N maximum scaffold gap length to output [inf]\n" -" --complex remove complex transitive edges\n" -" --no-complex don't remove complex transitive edges [default]\n" -" --SS expect contigs to be oriented correctly\n" -" --no-SS no assumption about contig orientation [default]\n" -" -o, --out=FILE write the paths to FILE\n" -" -g, --graph=FILE write the graph to FILE\n" -" -v, --verbose display verbose output\n" -" --help display this help and exit\n" -" --version output version information and exit\n" -" --db=FILE specify path of database repository in FILE\n" -" --library=NAME specify library NAME for sqlite\n" -" --strain=NAME specify strain NAME for sqlite\n" -" --species=NAME specify species NAME for sqlite\n" -"\n" -"Report bugs to <" PACKAGE_BUGREPORT ">.\n"; + "Usage: " PROGRAM " -k [OPTION]... FASTA|OVERLAP DIST...\n" + "Scaffold contigs using the distance estimate graph.\n" + "\n" + " Arguments:\n" + "\n" + " FASTA contigs in FASTA format\n" + " OVERLAP the contig overlap graph\n" + " DIST estimates of the distance between contigs\n" + "\n" + " Options:\n" + "\n" + " -n, --npairs=N minimum number of pairs [0]\n" + " or -n A-B:S Find the value of n in [A,B] with step size S\n" + " that maximizes the scaffold N50.\n" + " Default value for the step size is 1, if unspecified.\n" + " -s, --seed-length=N minimum contig length [1000]\n" + " or -s A-B Find the value of s in [A,B]\n" + " that maximizes the scaffold N50.\n" + " --grid optimize using a grid search [default]\n" + " --line optimize using a line search\n" + " -k, --kmer=N length of a k-mer\n" + " -G, --genome-size=N expected genome size. Used to calculate NG50\n" + " and associated stats [disabled]\n" + " --min-gap=N minimum scaffold gap length to output [50]\n" + " --max-gap=N maximum scaffold gap length to output [inf]\n" + " --complex remove complex transitive edges\n" + " --no-complex don't remove complex transitive edges [default]\n" + " --SS expect contigs to be oriented correctly\n" + " --no-SS no assumption about contig orientation [default]\n" + " -o, --out=FILE write the paths to FILE\n" + " -g, --graph=FILE write the graph to FILE\n" + " -v, --verbose display verbose output\n" + " --help display this help and exit\n" + " --version output version information and exit\n" + " --db=FILE specify path of database repository in FILE\n" + " --library=NAME specify library NAME for sqlite\n" + " --strain=NAME specify strain NAME for sqlite\n" + " --species=NAME specify species NAME for sqlite\n" + "\n" + "Report bugs to <" PACKAGE_BUGREPORT ">.\n"; namespace opt { - string db; - dbVars metaVars; +string db; +dbVars metaVars; - unsigned k; // used by ContigProperties +unsigned k; // used by ContigProperties - /** Optimization search strategy. */ - static int searchStrategy; +/** Optimization search strategy. */ +static int searchStrategy; - /** Minimum number of pairs. */ - static unsigned minEdgeWeight; - static unsigned minEdgeWeightEnd; - static unsigned minEdgeWeightStep; +/** Minimum number of pairs. */ +static unsigned minEdgeWeight; +static unsigned minEdgeWeightEnd; +static unsigned minEdgeWeightStep; - /** Minimum contig length. */ - static unsigned minContigLength = 1000; - static unsigned minContigLengthEnd = 1000; +/** Minimum contig length. */ +static unsigned minContigLength = 1000; +static unsigned minContigLengthEnd = 1000; - /** Genome size. Used to calculate NG50. */ - static long long unsigned genomeSize; +/** Genome size. Used to calculate NG50. */ +static long long unsigned genomeSize; - /** Minimum scaffold gap length to output. */ - static int minGap = 50; +/** Minimum scaffold gap length to output. */ +static int minGap = 50; - /** Maximum scaffold gap length to output. - * -ve value means no maximum. */ - static int maxGap = -1; +/** Maximum scaffold gap length to output. + * -ve value means no maximum. */ +static int maxGap = -1; - /** Write the paths to this file. */ - static string out; +/** Write the paths to this file. */ +static string out; - /** Write the graph to this file. */ - static string graphPath; +/** Write the graph to this file. */ +static string graphPath; - /** Run a strand-specific RNA-Seq assembly. */ - static int ss; +/** Run a strand-specific RNA-Seq assembly. */ +static int ss; - /** Verbose output. */ - int verbose; // used by PopBubbles +/** Verbose output. */ +int verbose; // used by PopBubbles - /** Output format */ - int format = DOT; // used by DistanceEst +/** Output format */ +int format = DOT; // used by DistanceEst - /** Remove complex transitive edges */ - static int comp_trans; +/** Remove complex transitive edges */ +static int comp_trans; } static const char shortopts[] = "G:g:k:n:o:s:v"; -enum { OPT_HELP = 1, OPT_VERSION, OPT_MIN_GAP, OPT_MAX_GAP, OPT_COMP, - OPT_DB, OPT_LIBRARY, OPT_STRAIN, OPT_SPECIES }; +enum +{ + OPT_HELP = 1, + OPT_VERSION, + OPT_MIN_GAP, + OPT_MAX_GAP, + OPT_COMP, + OPT_DB, + OPT_LIBRARY, + OPT_STRAIN, + OPT_SPECIES +}; /** Optimization search strategy. */ -enum { GRID_SEARCH, LINE_SEARCH }; +enum +{ + GRID_SEARCH, + LINE_SEARCH +}; static const struct option longopts[] = { - { "graph", no_argument, NULL, 'g' }, - { "kmer", required_argument, NULL, 'k' }, + { "graph", no_argument, NULL, 'g' }, + { "kmer", required_argument, NULL, 'k' }, { "genome-size", required_argument, NULL, 'G' }, - { "min-gap", required_argument, NULL, OPT_MIN_GAP }, - { "max-gap", required_argument, NULL, OPT_MAX_GAP }, - { "npairs", required_argument, NULL, 'n' }, - { "grid", no_argument, &opt::searchStrategy, GRID_SEARCH }, - { "line", no_argument, &opt::searchStrategy, LINE_SEARCH }, - { "out", required_argument, NULL, 'o' }, + { "min-gap", required_argument, NULL, OPT_MIN_GAP }, + { "max-gap", required_argument, NULL, OPT_MAX_GAP }, + { "npairs", required_argument, NULL, 'n' }, + { "grid", no_argument, &opt::searchStrategy, GRID_SEARCH }, + { "line", no_argument, &opt::searchStrategy, LINE_SEARCH }, + { "out", required_argument, NULL, 'o' }, { "seed-length", required_argument, NULL, 's' }, - { "complex", no_argument, &opt::comp_trans, 1 }, - { "no-complex", no_argument, &opt::comp_trans, 0 }, - { "SS", no_argument, &opt::ss, 1 }, - { "no-SS", no_argument, &opt::ss, 0 }, - { "verbose", no_argument, NULL, 'v' }, - { "help", no_argument, NULL, OPT_HELP }, - { "version", no_argument, NULL, OPT_VERSION }, - { "db", required_argument, NULL, OPT_DB }, - { "library", required_argument, NULL, OPT_LIBRARY }, - { "strain", required_argument, NULL, OPT_STRAIN }, - { "species", required_argument, NULL, OPT_SPECIES }, + { "complex", no_argument, &opt::comp_trans, 1 }, + { "no-complex", no_argument, &opt::comp_trans, 0 }, + { "SS", no_argument, &opt::ss, 1 }, + { "no-SS", no_argument, &opt::ss, 0 }, + { "verbose", no_argument, NULL, 'v' }, + { "help", no_argument, NULL, OPT_HELP }, + { "version", no_argument, NULL, OPT_VERSION }, + { "db", required_argument, NULL, OPT_DB }, + { "library", required_argument, NULL, OPT_LIBRARY }, + { "strain", required_argument, NULL, OPT_STRAIN }, + { "species", required_argument, NULL, OPT_SPECIES }, { NULL, 0, NULL, 0 } }; @@ -172,8 +186,11 @@ * An edge is invalid when the overlap is larger than the length of * either of its incident sequences. */ -struct InvalidEdge { - InvalidEdge(Graph& g) : m_g(g) { } +struct InvalidEdge +{ + InvalidEdge(Graph& g) + : m_g(g) + {} bool operator()(graph_traits::edge_descriptor e) const { int d = m_g[e].distance; @@ -185,8 +202,12 @@ }; /** Return whether the specified edges has sufficient support. */ -struct PoorSupport { - PoorSupport(Graph& g, unsigned minEdgeWeight) : m_g(g), m_minEdgeWeight(minEdgeWeight) { } +struct PoorSupport +{ + PoorSupport(Graph& g, unsigned minEdgeWeight) + : m_g(g) + , m_minEdgeWeight(minEdgeWeight) + {} bool operator()(graph_traits::edge_descriptor e) const { return m_g[e].numPairs < m_minEdgeWeight; @@ -196,7 +217,8 @@ }; /** Remove short vertices and unsupported edges from the graph. */ -static void filterGraph(Graph& g, unsigned minEdgeWeight, unsigned minContigLength) +static void +filterGraph(Graph& g, unsigned minEdgeWeight, unsigned minContigLength) { typedef graph_traits GTraits; typedef GTraits::vertex_descriptor V; @@ -230,13 +252,15 @@ } /** Return true if the specified edge is a cycle. */ -static bool isCycle(Graph& g, graph_traits::edge_descriptor e) +static bool +isCycle(Graph& g, graph_traits::edge_descriptor e) { return edge(target(e, g), source(e, g), g).second; } /** Remove simple cycles of length two from the graph. */ -static void removeCycles(Graph& g) +static void +removeCycles(Graph& g) { typedef graph_traits::edge_descriptor E; typedef graph_traits::edge_iterator Eit; @@ -265,7 +289,8 @@ * For a pair of edges (u,v1) and (u,v2) in g, if exactly one of the * edges (v1,v2) or (v2,v1) exists in g0, add that edge to g. */ -static void resolveForks(Graph& g, const Graph& g0) +static void +resolveForks(Graph& g, const Graph& g0) { typedef graph_traits::adjacency_iterator Vit; typedef graph_traits::edge_descriptor E; @@ -293,25 +318,22 @@ pair e21 = edge(v2, v1, g0); if (e12.second && e21.second) { if (opt::verbose > 1) - cerr << "cycle: " << get(vertex_name, g, v1) - << ' ' << get(vertex_name, g, v2) << '\n'; + cerr << "cycle: " << get(vertex_name, g, v1) << ' ' + << get(vertex_name, g, v2) << '\n'; } else if (e12.second || e21.second) { E e = e12.second ? e12.first : e21.first; V v = source(e, g0), w = target(e, g0); add_edge(v, w, g0[e], g); numEdges++; if (opt::verbose > 1) - cerr << get(vertex_name, g, u) - << " -> " << get(vertex_name, g, v) - << " -> " << get(vertex_name, g, w) - << " [" << g0[e] << "]\n"; + cerr << get(vertex_name, g, u) << " -> " << get(vertex_name, g, v) << " -> " + << get(vertex_name, g, w) << " [" << g0[e] << "]\n"; } } } } if (opt::verbose > 0) - cerr << "Added " << numEdges - << " edges to ambiguous vertices.\n"; + cerr << "Added " << numEdges << " edges to ambiguous vertices.\n"; if (!opt::db.empty()) addToDb(db, "E_added_ambig", numEdges); } @@ -320,7 +342,8 @@ * For an edge (u,v), remove the vertex v if deg+(u) > 1 * and deg-(v) = 1 and deg+(v) = 0. */ -static void pruneTips(Graph& g) +static void +pruneTips(Graph& g) { /** Identify the tips. */ size_t n = 0; @@ -340,7 +363,8 @@ * operation: remove vertex u * output: digraph g { t1->v1 t2->v2 } */ -static void removeRepeats(Graph& g) +static void +removeRepeats(Graph& g) { typedef graph_traits::adjacency_iterator Ait; typedef graph_traits::edge_descriptor E; @@ -349,28 +373,23 @@ vector repeats; vector transitive; find_transitive_edges(g, back_inserter(transitive)); - for (vector::const_iterator it = transitive.begin(); - it != transitive.end(); ++it) { + for (vector::const_iterator it = transitive.begin(); it != transitive.end(); ++it) { // Iterate through the transitive edges, u->w1. V u = source(*it, g), w1 = target(*it, g); Ait vit, vlast; - for (tie(vit, vlast) = adjacent_vertices(u, g); - vit != vlast; ++vit) { + for (tie(vit, vlast) = adjacent_vertices(u, g); vit != vlast; ++vit) { V v = *vit; assert(u != v); // no self loops if (!edge(v, w1, g).second) continue; // u->w1 is a transitive edge spanning u->v->w1. Ait wit, wlast; - for (tie(wit, wlast) = adjacent_vertices(v, g); - wit != wlast; ++wit) { + for (tie(wit, wlast) = adjacent_vertices(v, g); wit != wlast; ++wit) { // For each edge v->w2, check that an edge // w1->w2 or w2->w1 exists. If not, v is a repeat. V w2 = *wit; assert(v != w2); // no self loops - if (w1 != w2 - && !edge(w1, w2, g).second - && !edge(w2, w1, g).second) { + if (w1 != w2 && !edge(w1, w2, g).second && !edge(w2, w1, g).second) { repeats.push_back(v); break; } @@ -379,20 +398,17 @@ } sort(repeats.begin(), repeats.end()); - repeats.erase(unique(repeats.begin(), repeats.end()), - repeats.end()); + repeats.erase(unique(repeats.begin(), repeats.end()), repeats.end()); if (opt::verbose > 1) { cerr << "Ambiguous:"; - for (vector::const_iterator it = repeats.begin(); - it != repeats.end(); ++it) + for (vector::const_iterator it = repeats.begin(); it != repeats.end(); ++it) cerr << ' ' << get(vertex_name, g, *it); cerr << '\n'; } // Remove the repetitive vertices. unsigned numRemoved = 0; - for (vector::const_iterator it = repeats.begin(); - it != repeats.end(); ++it) { + for (vector::const_iterator it = repeats.begin(); it != repeats.end(); ++it) { V u = *it; V uc = get(vertex_complement, g, u); clear_out_edges(u, g); @@ -403,10 +419,8 @@ } if (opt::verbose > 0) { - cerr << "Cleared " - << repeats.size() << " ambiguous vertices.\n" - << "Removed " - << numRemoved << " ambiguous vertices.\n"; + cerr << "Cleared " << repeats.size() << " ambiguous vertices.\n" + << "Removed " << numRemoved << " ambiguous vertices.\n"; printGraphStats(cerr, g); } if (!opt::db.empty()) { @@ -421,7 +435,8 @@ * operation: remove edge u1->v2 * output: digraph g {u1->v1 u2->v2 } */ -static void removeWeakEdges(Graph& g) +static void +removeWeakEdges(Graph& g) { typedef graph_traits::edge_descriptor E; typedef graph_traits::edge_iterator Eit; @@ -478,11 +493,9 @@ if (opt::verbose > 1) { cerr << "Weak edges:\n"; - for (vector::const_iterator it = weak.begin(); - it != weak.end(); ++it) { + for (vector::const_iterator it = weak.begin(); it != weak.end(); ++it) { E e = *it; - cerr << '\t' << get(edge_name, g, e) - << " [" << g[e] << "]\n"; + cerr << '\t' << get(edge_name, g, e) << " [" << g[e] << "]\n"; } } @@ -496,7 +509,8 @@ addToDb(db, "E_removed_weak", weak.size()); } -static void removeLongEdges(Graph& g) +static void +removeLongEdges(Graph& g) { typedef graph_traits::edge_descriptor E; typedef graph_traits::edge_iterator Eit; @@ -514,7 +528,8 @@ /** Return whether the specified distance estimate is an exact * overlap. */ -static bool isOverlap(const DistanceEst& d) +static bool +isOverlap(const DistanceEst& d) { if (d.stdDev == 0) { assert(d.distance < 0); @@ -527,8 +542,8 @@ * @param g0 the original graph * @param g1 the transformed graph */ -static ContigPath addDistEst(const Graph& g0, const Graph& g1, - const ContigPath& path) +static ContigPath +addDistEst(const Graph& g0, const Graph& g1, const ContigPath& path) { typedef graph_traits::edge_descriptor E; typedef edge_bundle_type::type EP; @@ -537,14 +552,14 @@ out.reserve(2 * path.size()); ContigNode u = path.front(); out.push_back(u); - for (ContigPath::const_iterator it = path.begin() + 1; - it != path.end(); ++it) { + for (ContigPath::const_iterator it = path.begin() + 1; it != path.end(); ++it) { ContigNode v = *it; assert(!v.ambiguous()); pair e0 = edge(u, v, g0); pair e1 = edge(u, v, g1); if (!e0.second && !e1.second) - std::cerr << "error: missing edge: " << get(vertex_name, g0, u) << " -> " << get(vertex_name, g0, v) << '\n'; + std::cerr << "error: missing edge: " << get(vertex_name, g0, u) << " -> " + << get(vertex_name, g0, v) << '\n'; assert(e0.second || e1.second); const EP& ep = e0.second ? g0[e0.first] : g1[e1.first]; if (!isOverlap(ep)) { @@ -561,7 +576,8 @@ } /** Read a graph from the specified file. */ -static void readGraph(const string& path, Graph& g) +static void +readGraph(const string& path, Graph& g) { if (opt::verbose > 0) cerr << "Reading `" << path << "'...\n"; @@ -574,17 +590,16 @@ printGraphStats(cerr, g); vector vals = passGraphStatsVal(g); - vector keys = make_vector() - << "V_readGraph" - << "E_readGraph" - << "degree0_readGraph" - << "degree1_readGraph" - << "degree234_readGraph" - << "degree5_readGraph" - << "max_readGraph"; + vector keys = make_vector() << "V_readGraph" + << "E_readGraph" + << "degree0_readGraph" + << "degree1_readGraph" + << "degree234_readGraph" + << "degree5_readGraph" + << "max_readGraph"; if (!opt::db.empty()) { - for(unsigned i=0; i -unsigned addLength(const Graph& g, It first, It last) +unsigned +addLength(const Graph& g, It first, It last) { typedef typename graph_traits::vertex_descriptor V; assert(first != last); @@ -610,11 +626,11 @@ typedef vector ContigPaths; /** - * Build the scaffold length histogram. - * @param g The graph g is destroyed. - */ -static Histogram buildScaffoldLengthHistogram( - Graph& g, const ContigPaths& paths) + * Build the scaffold length histogram. + * @param g The graph g is destroyed. + */ +static Histogram +buildScaffoldLengthHistogram(Graph& g, const ContigPaths& paths) { Histogram h; @@ -626,11 +642,10 @@ // Remove the vertices that are used in paths // and add the lengths of the scaffolds. - for (ContigPaths::const_iterator it = paths.begin(); - it != paths.end(); ++it) { + for (ContigPaths::const_iterator it = paths.begin(); it != paths.end(); ++it) { h.insert(addLength(g, it->begin(), it->end())); - remove_vertex_if(g, it->begin(), it->end(), - not1(std::mem_fun_ref(&ContigNode::ambiguous))); + remove_vertex_if( + g, it->begin(), it->end(), [](const ContigNode& c) { return !c.ambiguous(); }); } // Add the contigs that were not used in paths. @@ -645,51 +660,63 @@ } /** Add contiguity stats to database */ -static void addCntgStatsToDb( - const Histogram h, const unsigned min) +static void +addCntgStatsToDb(const Histogram h, const unsigned min) { vector vals = passContiguityStatsVal(h, min); - vector keys = make_vector() - << "n" - << "n200" - << "nN50" - << "min" - << "N75" - << "N50" - << "N25" - << "Esize" - << "max" - << "sum" - << "nNG50" - << "NG50"; + vector keys = make_vector() << "n" + << "n200" + << "nN50" + << "min" + << "N75" + << "N50" + << "N25" + << "Esize" + << "max" + << "sum" + << "nNG50" + << "NG50"; if (!opt::db.empty()) { - for(unsigned i=0; i struct hash { - size_t operator()(const ScaffoldParam& param) const - { - return hash()(param.n) ^ hash()(param.s); - } - }; +template<> +struct hash +{ + size_t operator()(const ScaffoldParam& param) const + { + return hash()(param.n) ^ hash()(param.s); + } +}; NAMESPACE_STD_HASH_END /** Result of scaffolding. */ -struct ScaffoldResult : ScaffoldParam{ - ScaffoldResult() : ScaffoldParam(0, 0), n50(0) { } +struct ScaffoldResult : ScaffoldParam +{ + ScaffoldResult() + : ScaffoldParam(0, 0) + , n50(0) + {} ScaffoldResult(unsigned n, unsigned s, unsigned n50, std::string metrics) - : ScaffoldParam(n, s), n50(n50), metrics(metrics) { } + : ScaffoldParam(n, s) + , n50(n50) + , metrics(metrics) + {} unsigned n50; std::string metrics; }; @@ -699,9 +726,7 @@ * @return the scaffold N50 */ ScaffoldResult -scaffold(const Graph& g0, - unsigned minEdgeWeight, unsigned minContigLength, - bool output) +scaffold(const Graph& g0, unsigned minEdgeWeight, unsigned minContigLength, bool output) { Graph g(g0); @@ -744,8 +769,7 @@ typedef graph_traits::vertex_descriptor V; vector popped = popBubbles(g); if (opt::verbose > 0) { - cerr << "Removed " << popped.size() - << " vertices in bubbles.\n"; + cerr << "Removed " << popped.size() << " vertices in bubbles.\n"; printGraphStats(cerr, g); } @@ -754,8 +778,7 @@ if (opt::verbose > 1) { cerr << "Popped:"; - for (vector::const_iterator it = popped.begin(); - it != popped.end(); ++it) + for (vector::const_iterator it = popped.begin(); it != popped.end(); ++it) cerr << ' ' << get(vertex_name, g, *it); cerr << '\n'; } @@ -773,11 +796,9 @@ sort(paths.begin(), paths.end()); unsigned n = 0; if (opt::verbose > 0) { - for (ContigPaths::const_iterator it = paths.begin(); - it != paths.end(); ++it) + for (ContigPaths::const_iterator it = paths.begin(); it != paths.end(); ++it) n += it->size(); - cerr << "Assembled " << n << " contigs in " - << paths.size() << " scaffolds.\n"; + cerr << "Assembled " << n << " contigs in " << paths.size() << " scaffolds.\n"; printGraphStats(cerr, g); } @@ -792,10 +813,8 @@ ostream& out = opt::out.empty() || opt::out == "-" ? cout : fout; assert_good(out, opt::out); g_contigNames.unlock(); - for (vector::const_iterator it = paths.begin(); - it != paths.end(); ++it) - out << createContigName() << '\t' - << addDistEst(g0, g, *it) << '\n'; + for (vector::const_iterator it = paths.begin(); it != paths.end(); ++it) + out << createContigName() << '\t' << addDistEst(g0, g, *it) << '\n'; assert_good(out, opt::out); // Output the graph. @@ -811,15 +830,16 @@ const unsigned STATS_MIN_LENGTH = opt::minContigLength; std::ostringstream ss; Histogram scaffold_histogram = buildScaffoldLengthHistogram(g, paths); - printContiguityStats(ss, scaffold_histogram, STATS_MIN_LENGTH, - false, "\t", opt::genomeSize) - << "\tn=" << minEdgeWeight << " s=" << minContigLength << '\n'; + printContiguityStats(ss, scaffold_histogram, STATS_MIN_LENGTH, false, "\t", opt::genomeSize) + << "\tn=" << minEdgeWeight << " s=" << minContigLength << '\n'; std::string metrics = ss.str(); addCntgStatsToDb(scaffold_histogram, STATS_MIN_LENGTH); - return ScaffoldResult(minEdgeWeight, minContigLength, - scaffold_histogram.trimLow(STATS_MIN_LENGTH).n50(), - metrics); + return ScaffoldResult( + minEdgeWeight, + minContigLength, + scaffold_histogram.trimLow(STATS_MIN_LENGTH).n50(), + metrics); } /** Memoize the optimization results so far. */ @@ -858,10 +878,11 @@ /** Find the value of n that maximizes the scaffold N50. */ static ScaffoldResult -optimize_n(const Graph& g, - std::pair minEdgeWeight, - unsigned minContigLength, - ScaffoldMemo& memo) +optimize_n( + const Graph& g, + std::pair minEdgeWeight, + unsigned minContigLength, + ScaffoldMemo& memo) { std::string metrics_table; unsigned bestn = 0, bestN50 = 0; @@ -879,19 +900,17 @@ /** Find the value of s that maximizes the scaffold N50. */ static ScaffoldResult -optimize_s(const Graph& g, - unsigned minEdgeWeight, - std::pair minContigLength, - ScaffoldMemo& memo) +optimize_s( + const Graph& g, + unsigned minEdgeWeight, + std::pair minContigLength, + ScaffoldMemo& memo) { std::string metrics_table; unsigned bests = 0, bestN50 = 0; const double STEP = cbrt(10); // Three steps per decade. - unsigned ilast = (unsigned)round( - log(minContigLength.second) / log(STEP)); - for (unsigned i = (unsigned)round( - log(minContigLength.first) / log(STEP)); - i <= ilast; ++i) { + unsigned ilast = (unsigned)round(log(minContigLength.second) / log(STEP)); + for (unsigned i = (unsigned)round(log(minContigLength.first) / log(STEP)); i <= ilast; ++i) { unsigned s = (unsigned)pow(STEP, (int)i); // Round to 1 figure. @@ -911,9 +930,10 @@ /** Find the values of n and s that maximizes the scaffold N50. */ static ScaffoldResult -optimize_grid_search(const Graph& g, - std::pair minEdgeWeight, - std::pair minContigLength) +optimize_grid_search( + const Graph& g, + std::pair minEdgeWeight, + std::pair minContigLength) { const unsigned STATS_MIN_LENGTH = opt::minContigLength; if (opt::verbose == 0) @@ -935,9 +955,10 @@ /** Find the values of n and s that maximizes the scaffold N50. */ static ScaffoldResult -optimize_line_search(const Graph& g, - std::pair minEdgeWeight, - std::pair minContigLength) +optimize_line_search( + const Graph& g, + std::pair minEdgeWeight, + std::pair minContigLength) { const unsigned STATS_MIN_LENGTH = opt::minContigLength; if (opt::verbose == 0) @@ -946,10 +967,10 @@ ScaffoldMemo memo; std::string metrics_table; ScaffoldResult best( - (minEdgeWeight.first + minEdgeWeight.second) / 2, - minContigLength.second, 0, ""); + (minEdgeWeight.first + minEdgeWeight.second) / 2, minContigLength.second, 0, ""); // An upper limit on the number of iterations. - const unsigned MAX_ITERATIONS = 1 + (minEdgeWeight.second - minEdgeWeight.first) / opt::minEdgeWeightStep; + const unsigned MAX_ITERATIONS = + 1 + (minEdgeWeight.second - minEdgeWeight.first) / opt::minEdgeWeightStep; for (unsigned i = 0; i < MAX_ITERATIONS; ++i) { // Optimize s. if (opt::verbose > 0) { @@ -981,33 +1002,32 @@ } /** Run abyss-scaffold. */ -int main(int argc, char** argv) +int +main(int argc, char** argv) { if (!opt::db.empty()) opt::metaVars.resize(3); bool die = false; - for (int c; (c = getopt_long(argc, argv, - shortopts, longopts, NULL)) != -1;) { + for (int c; (c = getopt_long(argc, argv, shortopts, longopts, NULL)) != -1;) { istringstream arg(optarg != NULL ? optarg : ""); switch (c) { - case '?': + case '?': die = true; break; - case 'k': + case 'k': arg >> opt::k; break; - case 'G': - { - double x; - arg >> x; - opt::genomeSize = x; - break; - } - case 'g': + case 'G': { + double x; + arg >> x; + opt::genomeSize = x; + break; + } + case 'g': arg >> opt::graphPath; break; - case 'n': + case 'n': arg >> opt::minEdgeWeight; if (arg.peek() == '-') { arg >> expect("-") >> opt::minEdgeWeightEnd; @@ -1019,56 +1039,55 @@ else opt::minEdgeWeightStep = 1; break; - case 'o': + case 'o': arg >> opt::out; break; - case 's': + case 's': arg >> opt::minContigLength; if (arg.peek() == '-') { opt::minContigLengthEnd = 100 * opt::minContigLength; arg >> expect("-") >> opt::minContigLengthEnd; - assert(opt::minContigLength - <= opt::minContigLengthEnd); + assert(opt::minContigLength <= opt::minContigLengthEnd); } else opt::minContigLengthEnd = opt::minContigLength; break; - case 'v': + case 'v': opt::verbose++; break; - case OPT_MIN_GAP: + case OPT_MIN_GAP: arg >> opt::minGap; break; - case OPT_MAX_GAP: + case OPT_MAX_GAP: arg >> opt::maxGap; break; - case OPT_HELP: + case OPT_HELP: cout << USAGE_MESSAGE; exit(EXIT_SUCCESS); - case OPT_VERSION: + case OPT_VERSION: cout << VERSION_MESSAGE; exit(EXIT_SUCCESS); - case OPT_DB: + case OPT_DB: arg >> opt::db; break; - case OPT_LIBRARY: + case OPT_LIBRARY: arg >> opt::metaVars[0]; break; - case OPT_STRAIN: + case OPT_STRAIN: arg >> opt::metaVars[1]; break; - case OPT_SPECIES: + case OPT_SPECIES: arg >> opt::metaVars[2]; break; } if (optarg != NULL && !arg.eof()) { - cerr << PROGRAM ": invalid option: `-" - << (char)c << optarg << "'\n"; + cerr << PROGRAM ": invalid option: `-" << (char)c << optarg << "'\n"; exit(EXIT_FAILURE); } } if (opt::k <= 0) { - cerr << PROGRAM ": " << "missing -k,--kmer option\n"; + cerr << PROGRAM ": " + << "missing -k,--kmer option\n"; die = true; } @@ -1078,17 +1097,11 @@ } if (die) { - cerr << "Try `" << PROGRAM - << " --help' for more information.\n"; + cerr << "Try `" << PROGRAM << " --help' for more information.\n"; exit(EXIT_FAILURE); } if (!opt::db.empty()) { - init(db, - opt::db, - opt::verbose, - PROGRAM, - opt::getCommand(argc, argv), - opt::metaVars); + init(db, opt::db, opt::verbose, PROGRAM, opt::getCommand(argc, argv), opt::metaVars); addToDb(db, "K", opt::k); } @@ -1114,15 +1127,14 @@ remove_edge_if(InvalidEdge(g), static_cast(g)); unsigned numRemoved = numBefore - num_edges(g); if (numRemoved > 0) - cerr << "warning: Removed " - << numRemoved << " invalid edges.\n"; + cerr << "warning: Removed " << numRemoved << " invalid edges.\n"; if (!opt::db.empty()) addToDb(db, "Edges_invalid", numRemoved); const unsigned STATS_MIN_LENGTH = opt::minContigLength; - if (opt::minEdgeWeight == opt::minEdgeWeightEnd - && opt::minContigLength == opt::minContigLengthEnd) { + if (opt::minEdgeWeight == opt::minEdgeWeightEnd && + opt::minContigLength == opt::minContigLengthEnd) { ScaffoldResult result = scaffold(g, opt::minEdgeWeight, opt::minContigLength, true); // Print assembly contiguity statistics. if (opt::verbose > 0) @@ -1132,19 +1144,21 @@ } else { ScaffoldResult best(0, 0, 0, ""); switch (opt::searchStrategy) { - case GRID_SEARCH: - best = optimize_grid_search(g, - std::make_pair(opt::minEdgeWeight, opt::minEdgeWeightEnd), - std::make_pair(opt::minContigLength, opt::minContigLengthEnd)); - break; - case LINE_SEARCH: - best = optimize_line_search(g, - std::make_pair(opt::minEdgeWeight, opt::minEdgeWeightEnd), - std::make_pair(opt::minContigLength, opt::minContigLengthEnd)); - break; - default: - abort(); - break; + case GRID_SEARCH: + best = optimize_grid_search( + g, + std::make_pair(opt::minEdgeWeight, opt::minEdgeWeightEnd), + std::make_pair(opt::minContigLength, opt::minContigLengthEnd)); + break; + case LINE_SEARCH: + best = optimize_line_search( + g, + std::make_pair(opt::minEdgeWeight, opt::minEdgeWeightEnd), + std::make_pair(opt::minContigLength, opt::minContigLengthEnd)); + break; + default: + abort(); + break; } if (opt::verbose > 0) @@ -1158,7 +1172,9 @@ std::cerr << best.metrics; } - std::cerr << '\n' << "Best scaffold N50 is " << best.n50 << " at n=" << best.n << " s=" << best.s << ".\n"; + std::cerr << '\n' + << "Best scaffold N50 is " << best.n50 << " at n=" << best.n << " s=" << best.s + << ".\n"; // Print assembly contiguity statistics. std::cerr << '\n'; diff -Nru abyss-2.2.3/Sealer/README.md abyss-2.2.4/Sealer/README.md --- abyss-2.2.3/Sealer/README.md 2019-09-27 20:02:10.000000000 +0000 +++ abyss-2.2.4/Sealer/README.md 2020-01-30 18:48:44.000000000 +0000 @@ -24,7 +24,7 @@ Description =========== -Sealer is an application of Konnector that closes intra-scaffold gaps. It performs three sequential functions. First, regions with Ns are identified from an input scaffold. Flanking nucleotues (2 x 100bp) are extracted from those regions while respecting the strand (5' to 3') direction on the sequence immediately downstream of each gap. In the second step, flanking sequence pairs are used as input to Konnector along with a set of reads with a high level of coverage redundancy. Ideally, the reads should represent the original dataset from which the draft assembly is generated, or further whole genome shotgun (WGS) sequencing data generated from the same sample. Within Konnector, the input WGS reads are used to populate a Bloom filter, tiling the reads with a sliding window of length *k*, thus generating a probabilistic representation of all the *k*-mers in the reads. Konnector also uses crude error removal and correctional algorithms, eliminating singletons (*k*-mers that are observed only once) and fixing base mismatches in the flanking sequence pairs. Sealer launches Konnector processes using a user-input range of *k*-mer lengths. In the third and final operation, succesfully merged sequences are inserted into the gaps of the original scaffolds, and Sealer outputs a new gap-filled scaffold file. +Sealer is an application of Konnector that closes intra-scaffold gaps. It performs three sequential functions. First, regions with Ns are identified from an input scaffold. Flanking nucleotues (2 x 100bp) are extracted from those regions while respecting the strand (5' to 3') direction on the sequence immediately downstream of each gap. In the second step, flanking sequence pairs are used as input to Konnector along with a set of reads with a high level of coverage redundancy. Ideally, the reads should represent the original dataset from which the draft assembly is generated, or further whole genome shotgun (WGS) sequencing data generated from the same sample. Within Konnector, the input WGS reads are used to populate a Bloom filter, tiling the reads with a sliding window of length *k*, thus generating a probabilistic representation of all the *k*-mers in the reads. Konnector also uses crude error removal and correctional algorithms, eliminating singletons (*k*-mers that are observed only once) and fixing base mismatches in the flanking sequence pairs. Sealer launches Konnector processes using a user-input range of *k*-mer lengths. In the third and final operation, successfully merged sequences are inserted into the gaps of the original scaffolds, and Sealer outputs a new gap-filled scaffold file. Installation ============ @@ -159,6 +159,6 @@ * `--help`: display this help and exit * `--version`: output version information and exit -*k* is the size of *k*-mer for the de Bruijn graph. You may specify multiple values of *k*, which will increase the nubmer of gaps closed at the cost of increased run time. Multiple values of *k* ought to be specified in increasing order, as lower values of *k* have fewer coverage gaps and are less likely to misassemble. +*k* is the size of *k*-mer for the de Bruijn graph. You may specify multiple values of *k*, which will increase the number of gaps closed at the cost of increased run time. Multiple values of *k* ought to be specified in increasing order, as lower values of *k* have fewer coverage gaps and are less likely to misassemble. *P* is the threshold for number of paths allowed to be traversed. When set to 10, Konnector will attempt to close gaps even when there are 10 different paths found. It would attempt to create a consensus sequence between these paths. The default setting is 2. diff -Nru abyss-2.2.3/Sealer/sealer.cc abyss-2.2.4/Sealer/sealer.cc --- abyss-2.2.3/Sealer/sealer.cc 2019-09-27 20:02:10.000000000 +0000 +++ abyss-2.2.4/Sealer/sealer.cc 2020-01-30 18:48:44.000000000 +0000 @@ -71,7 +71,6 @@ " --print-flanks outputs flank files\n" " -S, --input-scaffold=FILE load scaffold from FILE\n" " -L, --flank-length=N length of flanks to be used as pseudoreads [100]\n" -" -D, --flank-distance=N distance of flank from gap [0]\n" " -G, --max-gap-length=N max gap size to fill in bp [800]; runtime increases\n" " exponentially with respect to this parameter\n" " -j, --threads=N use N parallel threads [1]\n"