Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Account for overlapping reads in Theoretical Sensitivity model #1802

Open
wants to merge 7 commits into
base: master
Choose a base branch
from
10 changes: 8 additions & 2 deletions src/main/java/picard/analysis/CollectWgsMetrics.java
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,9 @@ public class CollectWgsMetrics extends CommandLineProgram {
@Argument(doc="Allele fraction for which to calculate theoretical sensitivity.", optional = true)
public List<Double> ALLELE_FRACTION = new ArrayList<>(Arrays.asList(0.001, 0.005, 0.01, 0.02, 0.05, 0.1, 0.2, 0.3, 0.5));

@Argument(doc="Maximum value allowed for base quality resulting from overlapping reads. Often this is governed by PCR error rate. (Phred scale)", optional = true)
public int MAX_OVERLAP_BASE_QUAL = 45;

@Argument(doc = "If true, fast algorithm is used.")
public boolean USE_FAST_ALGORITHM = false;

Expand Down Expand Up @@ -242,8 +245,11 @@ protected int doWork() {
if (THEORETICAL_SENSITIVITY_OUTPUT != null) {
// Write out theoretical sensitivity results.
final MetricsFile<TheoreticalSensitivityMetrics, ?> theoreticalSensitivityMetrics = getMetricsFile();
log.info("Calculating theoretical sentitivity at " + ALLELE_FRACTION.size() + " allele fractions.");
List<TheoreticalSensitivityMetrics> tsm = TheoreticalSensitivity.calculateSensitivities(SAMPLE_SIZE, collector.getUnfilteredDepthHistogram(), collector.getUnfilteredBaseQHistogram(), ALLELE_FRACTION);
log.info("Calculating theoretical sensitivity at " + ALLELE_FRACTION.size() + " allele fractions.");

List<TheoreticalSensitivityMetrics> tsm = TheoreticalSensitivity.calculateSensitivities(SAMPLE_SIZE,
collector.getUnfilteredDepthHistogram(), collector.getUnfilteredBaseQHistogram(), ALLELE_FRACTION,
collector.getMetrics(dupeFilter, adapterFilter, mapqFilter, pairFilter).PCT_EXC_OVERLAP, MAX_OVERLAP_BASE_QUAL);
theoreticalSensitivityMetrics.addAllMetrics(tsm);
theoreticalSensitivityMetrics.write(THEORETICAL_SENSITIVITY_OUTPUT);
}
Expand Down
147 changes: 95 additions & 52 deletions src/main/java/picard/analysis/TheoreticalSensitivity.java

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,9 @@ protected abstract COLLECTOR makeCollector(final Set<MetricAccumulationLevel> ac
@Argument(doc="Output for Theoretical Sensitivity metrics where the allele fractions are provided by the ALLELE_FRACTION argument.", optional = true)
public File THEORETICAL_SENSITIVITY_OUTPUT;

@Argument(doc="Phred scaled PCR Error Rate for Theoretical Sensitivity model.", optional = true)
public int PCR_ERROR_RATE = 45;

@Argument(doc="Allele fraction for which to calculate theoretical sensitivity.", optional = true)
public List<Double> ALLELE_FRACTION = new ArrayList<>(Arrays.asList(0.001, 0.005, 0.01, 0.02, 0.05, 0.1, 0.2, 0.3, 0.5));
/**
Expand Down Expand Up @@ -169,7 +172,11 @@ protected int doWork() {
// Write out theoretical sensitivity results.
final MetricsFile<TheoreticalSensitivityMetrics, ?> theoreticalSensitivityMetrics = getMetricsFile();
log.info("Calculating theoretical sentitivity at " + ALLELE_FRACTION.size() + " allele fractions.");
List<TheoreticalSensitivityMetrics> tsm = TheoreticalSensitivity.calculateSensitivities(SAMPLE_SIZE, collector.getDepthHistogram(), collector.getBaseQualityHistogram(), ALLELE_FRACTION);

final double overlapProbability = ((PanelMetricsBase) metrics.getMetrics().toArray()[0]).PCT_EXC_OVERLAP;

List<TheoreticalSensitivityMetrics> tsm = TheoreticalSensitivity.calculateSensitivities(SAMPLE_SIZE,
collector.getDepthHistogram(), collector.getBaseQualityHistogram(), ALLELE_FRACTION, overlapProbability, PCR_ERROR_RATE);
theoreticalSensitivityMetrics.addAllMetrics(tsm);
theoreticalSensitivityMetrics.write(THEORETICAL_SENSITIVITY_OUTPUT);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -134,8 +134,6 @@ public abstract class TargetMetricsCollector<METRIC_TYPE extends MultilevelMetri

private static final double LOG_ODDS_THRESHOLD = 3.0;

private final File theoreticalSensitivityOutput = null;

private final int minimumMappingQuality;
private final int minimumBaseQuality;
private final boolean clipOverlappingReads;
Expand Down
2 changes: 1 addition & 1 deletion src/test/java/picard/IntelInflaterDeflaterLoadTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ public void testIntelDeflaterIsAvailable() {
}

private void checkIntelSupported(final String componentName) {
if (!SystemUtils.IS_OS_LINUX && !SystemUtils.IS_OS_MAC) {
if ((!SystemUtils.IS_OS_LINUX && !SystemUtils.IS_OS_MAC)) {
throw new SkipException(componentName + " is not available on this platform");
}

Expand Down
179 changes: 128 additions & 51 deletions src/test/java/picard/analysis/TheoreticalSensitivityTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,6 @@
import java.io.File;
import java.io.FileReader;
import java.util.*;
import java.util.stream.IntStream;

/**
* Created by davidben on 5/18/15.
Expand Down Expand Up @@ -254,28 +253,43 @@ public void testSensitivityAtConstantDepth(final double expected, final File met

// We ensure that even using different random seeds we converge to roughly the same value.
for (int i = 0; i < 3; i++) {
double result = TheoreticalSensitivity.sensitivityAtConstantDepth(depth, qualityHistogram, 3, sampleSize, alleleFraction, i);
double result = TheoreticalSensitivity.sensitivityAtConstantDepth(depth, qualityHistogram, 3,
sampleSize, alleleFraction, i, 0.0, 45);
Assert.assertEquals(result, expected, tolerance);
}
}

@DataProvider(name = "TheoreticalSensitivityDataProvider")
public Object[][] arbFracSensDataProvider() {
final File wgsMetricsFile = new File(TEST_DIR, "test_Solexa-332667.wgs_metrics");
final File hsMetricsHighDepthFile = new File(TEST_DIR, "ultra_high_depth.wgs_metrics");

// This test acts primarily as an integration test. The sample sizes
// are not quite large enough to converge properly, but is used for the purpose of
// keeping the compute time of the tests short.
return new Object[][]{
{0.90, wgsMetricsFile, 0.5, 400},
{0.78, wgsMetricsFile, 0.3, 400},
{0.29, wgsMetricsFile, 0.1, 500},
{0.08, wgsMetricsFile, 0.05, 500},
{0.90, wgsMetricsFile, 0.5, 400, false, 45},
{0.78, wgsMetricsFile, 0.3, 400, false, 45},
{0.29, wgsMetricsFile, 0.1, 500, false, 45},
{0.08, wgsMetricsFile, 0.05, 500, false, 45},

{0.90, wgsMetricsFile, 0.5, 400, true, 45},
{0.80, wgsMetricsFile, 0.3, 400, true, 45},
{0.35, wgsMetricsFile, 0.1, 500, true, 45},
{0.12, wgsMetricsFile, 0.05, 500, true, 45},

// This doesn't seem like a good strategy. I think
// I need to create tests that test a particular distribution of base quals and
// depth dist rather than reading it from a metrics file. I also should use overlap probability.
//{0.50, hsMetricsHighDepthFile, 0.01, 400, true, 45},
//{0.50, hsMetricsHighDepthFile, 0.01, 400, true, 90}
};
}

@Test(dataProvider = "TheoreticalSensitivityDataProvider")
public void testSensitivity(final double expected, final File metricsFile, final double alleleFraction, final int sampleSize) throws Exception {
public void testSensitivity(final double expected, final File metricsFile, final double alleleFraction,
final int sampleSize, final boolean useOverlapProbability,
final int pcrErrorRate) throws Exception {
// This tests Theoretical Sensitivity using distributions on both base quality scores
// and the depth histogram.

Expand All @@ -292,11 +306,115 @@ public void testSensitivity(final double expected, final File metricsFile, final
final Histogram<Integer> depthHistogram = histograms.get(0);
final Histogram<Integer> qualityHistogram = histograms.get(1);

final double result = TheoreticalSensitivity.theoreticalSensitivity(depthHistogram, qualityHistogram, sampleSize, 3, alleleFraction);
// Get overlap probability from PCT_EXC_OVERLAP
final double overlapProbability = ((WgsMetrics) metrics.getMetrics().toArray()[0]).PCT_EXC_OVERLAP;
final double result;
if (useOverlapProbability) {
result = TheoreticalSensitivity.theoreticalSensitivity(depthHistogram, qualityHistogram, sampleSize, 3, alleleFraction, overlapProbability, pcrErrorRate);
}
else {
result = TheoreticalSensitivity.theoreticalSensitivity(depthHistogram, qualityHistogram, sampleSize, 3, alleleFraction, 0, pcrErrorRate);
}

Assert.assertEquals(result, expected, tolerance);
}

@DataProvider(name = "fixedDistributionsDataProvider")
public Object[][] fixedDistributionsDataProvider() {
final Histogram<Integer> depthHistogram30X = new Histogram<>();
depthHistogram30X.increment(30, 100);

final Histogram<Integer> qualityHistogramAllQ30 = new Histogram<>();
qualityHistogramAllQ30.increment(30, 100);

final Histogram<Integer> depthHistogram1000X = new Histogram<>();
depthHistogram1000X.increment(1000, 1);

// Something is strange about how the depth distribution is acting.
// The quality distribution seems to be acting fine, but the depth
// dist seems to act as if there is an off by 1 error with a bin
// set at 0 depth, and reducing the sensitivity.

// I think there is a bug in the trapz code, seems very off by oneish.
// I also think there is something wrong with the model because it isn't monotonic with
// increasing fixed depth... WTF is going on there?

final Histogram<Integer> qualityHistogramAllQ60 = new Histogram<>();
qualityHistogramAllQ60.increment(60, 100);

return new Object[][]{
{0.58, depthHistogram30X, qualityHistogramAllQ30, 400, 0.1, 45},
{0.58, depthHistogram30X, qualityHistogramAllQ30, 400, 0.1, 45},
{1.00, depthHistogram1000X, qualityHistogramAllQ60, 400, 0.5, 60},
{1.00, depthHistogram1000X, qualityHistogramAllQ60, 400, 0.1, 105},

};
}

@Test(dataProvider = "fixedDistributionsDataProvider")
public void testSensitivityOnFixedDistributions(final double expected,
final Histogram depthHistogram, final Histogram qualityHistogram,
final int sampleSize, final double alleleFraction,
final int pcrErrorRate) {

double actual = TheoreticalSensitivity.theoreticalSensitivity(depthHistogram, qualityHistogram, sampleSize,
3, alleleFraction, 0.0, pcrErrorRate);

Assert.assertEquals(actual, expected, 0.01);
}

@Test
public void isMonotonicFixedDepth() {
final Histogram<Integer> qualityHistogram = new Histogram<>();
qualityHistogram.increment(20, 100);

final Histogram<Integer> depthHistogram = new Histogram<>();
depthHistogram.increment(20, 1);

int sampleSize = 100000;
int logOddsThreshold = 10;
double overlapProbability = 0.0;
int pcrErrorRate = 45;

double lastSensitivity = 0.0;
double currentSensitivity;
for(double alleleFraction = 0.49;alleleFraction <= 0.49;alleleFraction += 0.01) {
currentSensitivity = TheoreticalSensitivity.theoreticalSensitivity(depthHistogram, qualityHistogram, sampleSize,
logOddsThreshold, alleleFraction, overlapProbability, pcrErrorRate);
System.out.println("alleleFraction = " + alleleFraction + " currentSensitivity = " + currentSensitivity + " last = " + lastSensitivity);

Assert.assertTrue(currentSensitivity >= lastSensitivity);
lastSensitivity = currentSensitivity;
}
}

@Test
public void isMonotonicFixedAF() {
final double alleleFraction = 0.3;
final Histogram<Integer> qualityHistogram = new Histogram<>();
qualityHistogram.increment(30, 100);


int sampleSize = 100000;
int logOddsThreshold = 10;
double overlapProbability = 0.0;
int pcrErrorRate = 45;

double lastSensitivity = 0.0;
double currentSensitivity;
for(int currentDepth = 20;currentDepth <= 21;currentDepth++) {
final Histogram<Integer> depthHistogram = new Histogram<>();
depthHistogram.increment(currentDepth, 1);

currentSensitivity = TheoreticalSensitivity.theoreticalSensitivity(depthHistogram, qualityHistogram, sampleSize,
logOddsThreshold, alleleFraction, overlapProbability, pcrErrorRate);
System.out.println("alleleFraction = " + alleleFraction + " currentSensitivity = " + currentSensitivity + " last = " + lastSensitivity);

Assert.assertTrue(currentSensitivity >= lastSensitivity);
lastSensitivity = currentSensitivity;
}
}

@DataProvider(name = "equivalanceHetVsArbitrary")
public Object[][] equivalenceHetVsFull() {
final File wgsMetricsFile = new File(TEST_DIR, "test_Solexa-332667.wgs_metrics");
Expand Down Expand Up @@ -327,7 +445,8 @@ public void testHetVsArbitrary(final File metricsFile, final double tolerance, f
final double[] qualityDistribution = TheoreticalSensitivity.normalizeHistogram(qualityHistogram);
final double[] depthDistribution = TheoreticalSensitivity.normalizeHistogram(depthHistogram);

final double resultFromTS = TheoreticalSensitivity.theoreticalSensitivity(depthHistogram, qualityHistogram, sampleSize, 3, 0.5);
// TS is TheoreticalSensitivity, THS is TheoreticalHetSensitivity
final double resultFromTS = TheoreticalSensitivity.theoreticalSensitivity(depthHistogram, qualityHistogram, sampleSize, 3, 0.5, 0.0, 45);
final double resultFromTHS = TheoreticalSensitivity.hetSNPSensitivity(depthDistribution, qualityDistribution, sampleSize, 3);

Assert.assertEquals(resultFromTS, resultFromTHS, tolerance);
Expand Down Expand Up @@ -356,48 +475,6 @@ public void testCallingThreshold(final int totalDepth, final int altDepth, final
Assert.assertEquals(TheoreticalSensitivity.isCalled(totalDepth, altDepth, sumOfAltQualities, alleleFraction, logOddsThreshold), expectedCall);
}

@DataProvider(name = "sumOfGaussiansDataProvider")
public Object[][] sumOfGaussians() {
final File wgsMetricsFile = new File(TEST_DIR, "test_Solexa-332667.wgs_metrics");
final File targetedMetricsFile = new File(TEST_DIR, "test_25103070136.targeted_pcr_metrics");

// When we sum more base qualities from a particular distribution, it should look increasingly Gaussian.
return new Object[][]{
{wgsMetricsFile, 500, 0.03},
{wgsMetricsFile, 20, 0.05},
{wgsMetricsFile, 10, 0.10},
{targetedMetricsFile, 500, 0.03},
{targetedMetricsFile, 20, 0.05},
{targetedMetricsFile, 10, 0.10}
};
}

@Test(dataProvider = "sumOfGaussiansDataProvider")
public void testDrawSumOfQScores(final File metricsFile, final int altDepth, final double tolerance) throws Exception {
final MetricsFile<TheoreticalSensitivityMetrics, Integer> metrics = new MetricsFile<>();
try (final FileReader metricsFileReader = new FileReader(metricsFile)) {
metrics.read(metricsFileReader);
}

final List<Histogram<Integer>> histograms = metrics.getAllHistograms();

final Histogram<Integer> qualityHistogram = histograms.get(1);
final TheoreticalSensitivity.RouletteWheel qualityRW = new TheoreticalSensitivity.RouletteWheel(TheoreticalSensitivity.trimDistribution(TheoreticalSensitivity.normalizeHistogram(qualityHistogram)));

final Random randomNumberGenerator = new Random(51);

// Calculate mean and deviation of quality score distribution to enable Gaussian sampling below
final double averageQuality = qualityHistogram.getMean();
final double standardDeviationQuality = qualityHistogram.getStandardDeviation();

for (int k = 0; k < 1; k++) {
int sumOfQualitiesFull = IntStream.range(0, altDepth).map(n -> qualityRW.draw()).sum();
int sumOfQualities = TheoreticalSensitivity.drawSumOfQScores(altDepth, averageQuality, standardDeviationQuality, randomNumberGenerator.nextGaussian());

Assert.assertEquals(sumOfQualitiesFull, sumOfQualities, sumOfQualitiesFull * tolerance);
}
}

@DataProvider(name = "trimDistributionDataProvider")
public Object[][] trimDistributions() {
return new Object[][]{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -159,7 +159,8 @@ public void testCoverageHistogram() throws IOException {
"MINIMUM_MAPPING_QUALITY=" + minimumMappingQuality,
"MINIMUM_BASE_QUALITY=" + minimumBaseQuality,
"CLIP_OVERLAPPING_READS=" + clipOverlappingReads,
"SAMPLE_SIZE=" + sampleSize
"SAMPLE_SIZE=" + sampleSize,
"THEORETICAL_SENSITIVITY_OUTPUT=" + "ts.out"
};

Assert.assertEquals(runPicardCommandLine(args), 0);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -166,14 +166,26 @@ public Object[][] theoreticalSensitivityDataProvider() {
// the tests from taking too long to run.
{tempSamFile, outfile, tsOutfile, perTargetOutfile, referenceFile, singleIntervals, 2000,
Arrays.asList(0.01, 0.05, 0.10, 0.30, 0.50), // Allele fraction
Arrays.asList(0.01, 0.54, 0.93, 0.99, 0.99) // Expected sensitivity
Arrays.asList(0.01, 0.54, 0.93, 0.99, 0.99), // Expected sensitivity
45 // PCR Error Rate
},

// This is the same as the previous test but uses a lower value for the pcrErrorRate.
// The important thing captured here is that the expected sensitivities should be
// less than or equal to the values from the previous test.
{tempSamFile, outfile, tsOutfile, perTargetOutfile, referenceFile, singleIntervals, 2000,
Arrays.asList(0.01, 0.05, 0.10, 0.30, 0.50), // Allele fraction
Arrays.asList(0.00, 0.31, 0.87, 0.99, 0.99), // Expected sensitivity
10 // PCR Error Rate
}
};
}

@Test(dataProvider = "theoreticalSensitivityDataProvider")
public void runCollectTargetedMetricsTheoreticalSensitivityTest(final File input, final File outfile, final File tsOutfile, final File perTargetOutfile, final String referenceFile,
final String targetIntervals, final int sampleSize, final List<Double> alleleFractions, final List<Double> expectedSensitivities) throws IOException {
final String targetIntervals, final int sampleSize,
final List<Double> alleleFractions, final List<Double> expectedSensitivities,
final int pcrErrorRate) throws IOException {

final String[] args = new String[] {
"TARGET_INTERVALS=" + targetIntervals,
Expand All @@ -184,6 +196,7 @@ public void runCollectTargetedMetricsTheoreticalSensitivityTest(final File input
"LEVEL=ALL_READS",
"AMPLICON_INTERVALS=" + targetIntervals,
"THEORETICAL_SENSITIVITY_OUTPUT=" + tsOutfile.getAbsolutePath(),
"PCR_ERROR_RATE=" + pcrErrorRate,
"SAMPLE_SIZE=" + sampleSize
};

Expand Down