Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Account for overlapping reads in Theoretical Sensitivity model #1802

Open
wants to merge 7 commits into
base: master
Choose a base branch
from
10 changes: 8 additions & 2 deletions src/main/java/picard/analysis/CollectWgsMetrics.java
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,9 @@ public class CollectWgsMetrics extends CommandLineProgram {
@Argument(doc="Allele fraction for which to calculate theoretical sensitivity.", optional = true)
public List<Double> ALLELE_FRACTION = new ArrayList<>(Arrays.asList(0.001, 0.005, 0.01, 0.02, 0.05, 0.1, 0.2, 0.3, 0.5));

@Argument(doc="Maximum value allowed for base quality resulting from overlapping reads. Often this is governed by PCR error rate. (Phred scale)", optional = true)
public int MAX_OVERLAP_BASE_QUAL = 45;

@Argument(doc = "If true, fast algorithm is used.")
public boolean USE_FAST_ALGORITHM = false;

Expand Down Expand Up @@ -242,8 +245,11 @@ protected int doWork() {
if (THEORETICAL_SENSITIVITY_OUTPUT != null) {
// Write out theoretical sensitivity results.
final MetricsFile<TheoreticalSensitivityMetrics, ?> theoreticalSensitivityMetrics = getMetricsFile();
log.info("Calculating theoretical sentitivity at " + ALLELE_FRACTION.size() + " allele fractions.");
List<TheoreticalSensitivityMetrics> tsm = TheoreticalSensitivity.calculateSensitivities(SAMPLE_SIZE, collector.getUnfilteredDepthHistogram(), collector.getUnfilteredBaseQHistogram(), ALLELE_FRACTION);
log.info("Calculating theoretical sensitivity at " + ALLELE_FRACTION.size() + " allele fractions.");

List<TheoreticalSensitivityMetrics> tsm = TheoreticalSensitivity.calculateSensitivities(SAMPLE_SIZE,
collector.getUnfilteredDepthHistogram(), collector.getUnfilteredBaseQHistogram(), ALLELE_FRACTION,
collector.getMetrics(dupeFilter, adapterFilter, mapqFilter, pairFilter).PCT_EXC_OVERLAP, MAX_OVERLAP_BASE_QUAL);
theoreticalSensitivityMetrics.addAllMetrics(tsm);
theoreticalSensitivityMetrics.write(THEORETICAL_SENSITIVITY_OUTPUT);
}
Expand Down
71 changes: 40 additions & 31 deletions src/main/java/picard/analysis/TheoreticalSensitivity.java
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,6 @@ public class TheoreticalSensitivity {
private static final Log log = Log.getInstance(TheoreticalSensitivity.class);
private static final int SAMPLING_MAX = 600; //prevent 'infinite' loops
private static final int MAX_CONSIDERED_DEPTH_HET_SENS = 1000; // No point in looking any deeper than this, otherwise GC overhead is too high. Only used for HET sensitivity.
private static final int LARGE_NUMBER_OF_DRAWS = 10; // The number of draws at which we believe a Gaussian approximation to sum random variables.
private static final double DEPTH_BIN_WIDTH = 0.01; // Minimal fraction of depth histogram to use when integrating theoretical sensitivity. This ensures we don't calculate theoretical sensitivity at every depth, which would be computationally expensive.
private static final int RANDOM_SEED = 51;

Expand Down Expand Up @@ -241,9 +240,14 @@ public TheoreticalSensitivity() {
* @param sampleSize sampleSize is the total number of simulations to run
* @param alleleFraction the allele fraction to evaluate sensitivity at
* @param randomSeed random number seed to use for random number generator
* @param overlapProbability Probability a position on a read overlaps with its mate
* @param maxOverlapBaseQuality Maximum base quality that can result from summing overlapping base qualities, often capped by PCR error rate
* @return Theoretical sensitivity for the given arguments at a constant depth.
*/
public static double sensitivityAtConstantDepth(final int depth, final Histogram<Integer> qualityHistogram, final double logOddsThreshold, final int sampleSize, final double alleleFraction, final long randomSeed) {
public static double sensitivityAtConstantDepth(final int depth, final Histogram<Integer> qualityHistogram,
final double logOddsThreshold, final int sampleSize,
final double alleleFraction, final long randomSeed,
final double overlapProbability, final int maxOverlapBaseQuality) {
// If the depth is 0 at a particular locus, the sensitivity is trivially 0.0.
if (depth == 0) {
return 0.0;
Expand All @@ -264,37 +268,28 @@ public static double sensitivityAtConstantDepth(final int depth, final Histogram
for (int sample = 0; sample < sampleSize; sample++) {
final int altDepth = bd.sample();

final int sumOfQualities;
if (altDepth < LARGE_NUMBER_OF_DRAWS) {
// If the number of alt reads is "small" we draw from the actual base quality distribution.
sumOfQualities = IntStream.range(0, altDepth).map(n -> qualityRW.draw()).sum();
} else {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Have you profiled this change, in particular when we care about extremely deep data?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The tests are a bit long now, I'm trying to resolve the issue.

// If the number of alt reads is "large" we draw from a Gaussian approximation of the base
// quality distribution to speed up the code.
sumOfQualities = drawSumOfQScores(altDepth, averageQuality, standardDeviationQuality, randomNumberGenerator.nextGaussian());
int sumOfQualities;
// If the number of alt reads is "small" we draw from the actual base quality distribution.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Since it looks like you're getting rid of the Gaussian approximation at larger depth it seems that this comment is now moot.

sumOfQualities = 0;
for(int i = 0;i < altDepth;i++) {
if(rg.nextDouble() > overlapProbability) {
sumOfQualities += Math.min(qualityRW.draw(), maxOverlapBaseQuality);
} else {
// Simulate overlapping reads by sampling from the quality distribution twice.
// Since PCR errors affect overlapping reads, the total contribution to the sum
// of qualities cannot exceed the PCR error rate.
sumOfQualities += Math.min(qualityRW.draw() + qualityRW.draw(), maxOverlapBaseQuality);
}
}


if (isCalled(depth, altDepth, (double) sumOfQualities, alleleFraction, logOddsThreshold)) {
calledVariants++;
}
}
return (double) calledVariants / sampleSize;
}

/**
* Simulates the sum of base qualities taken from reads that support the alternate allele by
* taking advantage of the fact that the sum of draws from a distribution tends towards a
* Gaussian per the Central Limit Theorem.
* @param altDepth Number of draws to take from base quality distribution
* @param averageQuality Average quality of alt bases
* @param standardDeviationQuality Sample standard deviation of base quality scores
* @param z number of standard deviation from the mean to take sum over
* @return Simulated sum of base qualities the support the alternate allele
*/
static int drawSumOfQScores(final int altDepth, final double averageQuality, final double standardDeviationQuality, final double z) {
return (int) (altDepth * averageQuality + z * Math.sqrt(altDepth) * standardDeviationQuality);
}

/**
* Calculates the theoretical sensitivity with a given Phred-scaled quality score distribution at a constant
* depth.
Expand All @@ -305,8 +300,12 @@ static int drawSumOfQScores(final int altDepth, final double averageQuality, fin
* @param alleleFraction the allele fraction to evaluate sensitivity at
* @return Theoretical sensitivity for the given arguments at a constant depth.
*/
private static double sensitivityAtConstantDepth(final int depth, final Histogram<Integer> qualityHistogram, final double logOddsThreshold, final int sampleSize, final double alleleFraction) {
return sensitivityAtConstantDepth(depth, qualityHistogram, logOddsThreshold, sampleSize, alleleFraction, RANDOM_SEED);
private static double sensitivityAtConstantDepth(final int depth, final Histogram<Integer> qualityHistogram,
final double logOddsThreshold, final int sampleSize,
final double alleleFraction, final double overlapProbability,
final int pcrErrorRate) {
return sensitivityAtConstantDepth(depth, qualityHistogram, logOddsThreshold, sampleSize, alleleFraction,
RANDOM_SEED, overlapProbability, pcrErrorRate);
}

/**
Expand All @@ -317,10 +316,14 @@ private static double sensitivityAtConstantDepth(final int depth, final Histogra
* @param sampleSize the total number of simulations to run
* @param logOddsThreshold Log odds threshold necessary for variant to be called
* @param alleleFraction the allele fraction to evaluate sensitivity at
* @param overlapProbability the probability two bases from the same fragment are overlapping
* @param pcrErrorRate Upper bound for the effective base quality two bases in an overlapping read
* @return Theoretical sensitivity for the given arguments over a particular depth distribution.
*/
public static double theoreticalSensitivity(final Histogram<Integer> depthHistogram, final Histogram<Integer> qualityHistogram,
final int sampleSize, final double logOddsThreshold, final double alleleFraction) {
public static double theoreticalSensitivity(final Histogram<Integer> depthHistogram,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

do we need to keep this method overload, given that it is only used in tests? I get that it'd be an API change, but not sure how we feel about that. You're changing the API anyway with respect to sensitivityAtConstantDepth, so probably best to be consistent.

final Histogram<Integer> qualityHistogram, final int sampleSize,
final double logOddsThreshold, final double alleleFraction,
final double overlapProbability, final int pcrErrorRate) {
if (alleleFraction > 1.0 || alleleFraction < 0.0) {
throw new IllegalArgumentException("Allele fractions must be between 0 and 1.");
}
Expand All @@ -346,7 +349,8 @@ public static double theoreticalSensitivity(final Histogram<Integer> depthHistog
}
// Calculate sensitivity for a particular depth, and use trapezoid rule to integrate sensitivity.
final double left = right;
right = sensitivityAtConstantDepth(currentDepth, qualityHistogram, logOddsThreshold, sampleSize, alleleFraction);
right = sensitivityAtConstantDepth(currentDepth, qualityHistogram, logOddsThreshold, sampleSize,
alleleFraction, overlapProbability, pcrErrorRate);
sensitivity += deltaDepthProbability * (left + right) / 2.0;
}
return sensitivity;
Expand Down Expand Up @@ -386,7 +390,11 @@ static double[] trimDistribution(final double[] distribution) {
* @param alleleFractions List of allele fractions to measure theoretical sensitivity over.
*/
public static List<TheoreticalSensitivityMetrics> calculateSensitivities(final int simulationSize,
final Histogram<Integer> depthHistogram, final Histogram<Integer> baseQHistogram, final List<Double> alleleFractions) {
final Histogram<Integer> depthHistogram,
final Histogram<Integer> baseQHistogram,
final List<Double> alleleFractions,
final double overlapProbability,
final int pcrErrorRate) {

final List<TheoreticalSensitivityMetrics> metricsOverVariousAlleleFractions = new ArrayList<>();
final double logOddsThreshold = 6.2; // This threshold is used because it is the value used for MuTect2.
Expand All @@ -396,7 +404,8 @@ public static List<TheoreticalSensitivityMetrics> calculateSensitivities(final i
for (final double alleleFraction : alleleFractions) {
final TheoreticalSensitivityMetrics theoreticalSensitivityMetrics = new TheoreticalSensitivityMetrics();
theoreticalSensitivityMetrics.ALLELE_FRACTION = alleleFraction;
theoreticalSensitivityMetrics.THEORETICAL_SENSITIVITY = TheoreticalSensitivity.theoreticalSensitivity(depthHistogram, baseQHistogram, simulationSize, logOddsThreshold, alleleFraction);
theoreticalSensitivityMetrics.THEORETICAL_SENSITIVITY = TheoreticalSensitivity.theoreticalSensitivity(
depthHistogram, baseQHistogram, simulationSize, logOddsThreshold, alleleFraction, overlapProbability, pcrErrorRate);
theoreticalSensitivityMetrics.THEORETICAL_SENSITIVITY_Q = QualityUtil.getPhredScoreFromErrorProbability((1 - theoreticalSensitivityMetrics.THEORETICAL_SENSITIVITY));
theoreticalSensitivityMetrics.SAMPLE_SIZE = simulationSize;
theoreticalSensitivityMetrics.LOG_ODDS_THRESHOLD = logOddsThreshold;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,9 @@ protected abstract COLLECTOR makeCollector(final Set<MetricAccumulationLevel> ac
@Argument(doc="Output for Theoretical Sensitivity metrics where the allele fractions are provided by the ALLELE_FRACTION argument.", optional = true)
public File THEORETICAL_SENSITIVITY_OUTPUT;

@Argument(doc="Phred scaled PCR Error Rate for Theoretical Sensitivity model.", optional = true)
public int PCR_ERROR_RATE = 45;

@Argument(doc="Allele fraction for which to calculate theoretical sensitivity.", optional = true)
public List<Double> ALLELE_FRACTION = new ArrayList<>(Arrays.asList(0.001, 0.005, 0.01, 0.02, 0.05, 0.1, 0.2, 0.3, 0.5));
/**
Expand Down Expand Up @@ -169,7 +172,11 @@ protected int doWork() {
// Write out theoretical sensitivity results.
final MetricsFile<TheoreticalSensitivityMetrics, ?> theoreticalSensitivityMetrics = getMetricsFile();
log.info("Calculating theoretical sentitivity at " + ALLELE_FRACTION.size() + " allele fractions.");
List<TheoreticalSensitivityMetrics> tsm = TheoreticalSensitivity.calculateSensitivities(SAMPLE_SIZE, collector.getDepthHistogram(), collector.getBaseQualityHistogram(), ALLELE_FRACTION);

final double overlapProbability = ((PanelMetricsBase) metrics.getMetrics().toArray()[0]).PCT_EXC_OVERLAP;

List<TheoreticalSensitivityMetrics> tsm = TheoreticalSensitivity.calculateSensitivities(SAMPLE_SIZE,
collector.getDepthHistogram(), collector.getBaseQualityHistogram(), ALLELE_FRACTION, overlapProbability, PCR_ERROR_RATE);
theoreticalSensitivityMetrics.addAllMetrics(tsm);
theoreticalSensitivityMetrics.write(THEORETICAL_SENSITIVITY_OUTPUT);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -134,8 +134,6 @@ public abstract class TargetMetricsCollector<METRIC_TYPE extends MultilevelMetri

private static final double LOG_ODDS_THRESHOLD = 3.0;

private final File theoreticalSensitivityOutput = null;

private final int minimumMappingQuality;
private final int minimumBaseQuality;
private final boolean clipOverlappingReads;
Expand Down
2 changes: 1 addition & 1 deletion src/test/java/picard/IntelInflaterDeflaterLoadTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ public void testIntelDeflaterIsAvailable() {
}

private void checkIntelSupported(final String componentName) {
if (!SystemUtils.IS_OS_LINUX && !SystemUtils.IS_OS_MAC) {
if ((!SystemUtils.IS_OS_LINUX && !SystemUtils.IS_OS_MAC)) {
throw new SkipException(componentName + " is not available on this platform");
}

Expand Down
Loading