This adds a parser for the new PacBio CCS data BAM format.  It wraps
the already existing BAM files and converts the SAMAlignedSequence type
into a PacBioCCSRead type for less memory storage and easier access.
Tests to verify the behavior are included.
This commit is contained in:
Nigel Delaney 2015-08-03 00:10:43 -07:00
Родитель 0cbd8fe25a
Коммит 0f45347362
9 изменённых файлов: 311 добавлений и 2 удалений

Просмотреть файл

@ -1,5 +1,5 @@
<?xml version="1.0" encoding="utf-8"?>
<Project ToolsVersion="12.0" DefaultTargets="Build" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<Project ToolsVersion="4.0" DefaultTargets="Build" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<Import Project="$(MSBuildExtensionsPath)\$(MSBuildToolsVersion)\Microsoft.Common.props" Condition="Exists('$(MSBuildExtensionsPath)\$(MSBuildToolsVersion)\Microsoft.Common.props')" />
<PropertyGroup>
<MinimumVisualStudioVersion>10.0</MinimumVisualStudioVersion>
@ -431,6 +431,8 @@
<Compile Include="Util\UOPair.cs" />
<Compile Include="ValueConverter.cs" />
<Compile Include="WordMatch.cs" />
<Compile Include="IO\PacBio\PacBioCCSRead.cs" />
<Compile Include="IO\PacBio\PacBioCCSBamReader.cs" />
</ItemGroup>
<ItemGroup>
<EmbeddedResource Include="CloneLibrary\Resources\Library.txt" />
@ -449,7 +451,7 @@
</ItemGroup>
<ItemGroup>
<ProjectReference Include="..\Shims\Bio.Platform.Helpers\Bio.Platform.Helpers.csproj">
<Project>{83cbb611-4049-456a-b87b-0b2cfc4797fa}</Project>
<Project>{83CBB611-4049-456A-B87B-0B2CFC4797FA}</Project>
<Name>Bio.Platform.Helpers</Name>
</ProjectReference>
</ItemGroup>
@ -474,4 +476,7 @@ cp "$(TargetDir)\$(TargetName).xml" "$(SolutionDir)\bin\$(TargetName).xml"</Post
<Target Name="AfterBuild">
</Target>
-->
<ItemGroup>
<Folder Include="IO\PacBio\" />
</ItemGroup>
</Project>

Просмотреть файл

@ -0,0 +1,56 @@
using System;
using Bio;
using Bio.IO.BAM;
using Bio.IO.SAM;
using System.Collections.Generic;
using System.Linq;
using System.IO;
namespace Bio.IO.PacBio
{
/// <summary>
/// Reads the data from a PacBio BAM file produced by the pbccs program.
/// </summary>
public class PacBioCCSBamReader
{
/// <summary>
/// Parse the CCS reads in a PacBio CCS BAM File.
/// </summary>
/// <returns>The CCS reads.</returns>
/// <param name="stream">A stream to parse.</param>
public static IEnumerable<PacBioCCSRead> Parse(Stream stream) {
if (stream == null)
{
throw new ArgumentNullException("stream");
}
BAMParser bp = new BAMParser ();
var header = bp.GetHeader (stream);
var field = header.RecordFields.ToList();
var pg = field.Where( p => p.Typecode=="PG").FirstOrDefault();
if (pg == null) {
throw new ArgumentException ("BAM file did not contain a 'PG' tag in header");
}
var cl = pg.Tags.Where (z => z.Tag == "CL").FirstOrDefault ();
if (cl == null) {
throw new ArgumentException ("BAM file did not contain a 'CL' tag within the 'PG' group in header.");
}
var cmd = cl.Value;
if(!cmd.StartsWith("ccs")) {
throw new ArgumentException("This is not a BAM file produced by the ccs command.");
}
stream.Seek(0, SeekOrigin.Begin);
foreach (var s in bp.Parse (stream)) {
yield return new PacBioCCSRead (s as SAMAlignedSequence);
}
}
}
}

Просмотреть файл

@ -0,0 +1,160 @@
using System;
using Bio;
using Bio.IO.BAM;
using Bio.IO.SAM;
using System.Collections.Generic;
using System.Linq;
/// <summary>
/// Classes and methods to read and write PacBio data.
/// </summary>
namespace Bio.IO.PacBio
{
/// <summary>
/// A class representing the consensus sequence generated from multiple subreads. This is produced
/// by the program pbccs and output to a BAM file.
/// </summary>
public class PacBioCCSRead
{
/// <summary>
/// A measure of CCS read quality, currently capped at 99.9% (QV30)
/// </summary>
public readonly float ReadQuality;
/// <summary>
/// The read group.
/// </summary>
public readonly string ReadGroup;
/// <summary>
/// The number of subreads that were used to generate the consensus sequence.
/// </summary>
public readonly int NumPasses;
/// <summary>
/// The outcome counts for results of adding each subread. Most subreads
/// should be correctly added, but some may be dropped due to various
/// exceptions enumerated here.
/// </summary>
private readonly int[] statusCounts;
/// <summary>
/// Count of subreads successfully added for consensus generation.
/// </summary>
public int ReadCountSuccessfullyAdded {
get{ return statusCounts [0]; }
}
/// <summary>
/// Count of subreads not added to consensus for failing alpha/beta mismatch check.
/// </summary>
/// <value>The ReadCount alpha beta mismatch.</value>
public int ReadCountAlphaBetaMismatch {
get { return statusCounts [1]; }
}
/// <summary>
/// Count of subreads not added to consensus for allocating too much memory.
/// </summary>
/// <value>The ReadCount mem fail.</value>
public int ReadCountMemFail {
get { return statusCounts [2]; }
}
/// <summary>
/// Count of subreads not added to consensus for having too low a Z-score.
/// </summary>
/// <value>The ReadCount bad zscore.</value>
public int ReadCountBadZscore {
get { return statusCounts [3]; }
}
public int ReadCountOther {
get{ return statusCounts [4]; }
}
/// <summary>
/// The SNR of the A Channel
/// </summary>
public readonly float SnrA;
/// <summary>
/// The SNR of the C Channel
/// </summary>
public readonly float SnrC;
/// <summary>
/// The SNR of the G Channel
/// </summary>
public readonly float SnrG;
/// <summary>
/// The SNR of the T Channel
/// </summary>
public readonly float SnrT;
/// <summary>
/// The name of the movie this CCS read came from.
/// </summary>
public string Movie;
/// <summary>
/// What is the hole number for the ZMW.
/// </summary>
public int HoleNumber;
/// <summary>
/// The average Z-score for all subreads.
/// </summary>
public float AvgZscore;
/// <summary>
/// An array of Z-scores for each subread added. Subreads that were not added are report as
/// Double.NaN values.
/// </summary>
public readonly float[] ZScores;
/// <summary>
/// The consensus sequence and associated QV values.
/// </summary>
public QualitativeSequence Sequence;
/// <summary>
/// Initializes a new instance of the <see cref="Bio.IO.PacBio.PacBioCCSRead"/> class. From an initially parsed BAM file.
/// </summary>
/// <param name="s">S.</param>
public PacBioCCSRead (SAMAlignedSequence s)
{
/* TODO: Converting from binary to string and back is beyond silly...
* no performance hit worth worrying about at present, but in the future it might be worth
* going directly from binary to the type rather than through string intermediates */
foreach (var v in s.OptionalFields) {
if (v.Tag == "sn") {
var snrs = v.Value.Split (',').Skip (1).Select (x => Convert.ToSingle (x)).ToArray ();
SnrA = snrs [0];
SnrC = snrs [1];
SnrG = snrs [2];
SnrT = snrs [3];
} else if (v.Tag == "zm") {
HoleNumber = (int)Convert.ToInt32 (v.Value);
} else if (v.Tag == "rq") {
ReadQuality = Convert.ToInt32 (v.Value) / 1000.0f;
} else if (v.Tag == "za") {
AvgZscore = (float)Convert.ToSingle (v.Value);
} else if (v.Tag == "rs") {
statusCounts = v.Value.Split (',').Skip (1).Select (x => Convert.ToInt32 (x)).ToArray ();
} else if (v.Tag == "np") {
NumPasses = Convert.ToInt32 (v.Value);
} else if (v.Tag == "RG") {
ReadGroup = v.Value;
} else if (v.Tag == "zs") {
ZScores = v.Value.Split (',').Skip (1).Select (x => Convert.ToSingle (x)).ToArray ();
}
}
// TODO: We should use String.Intern here, but not available in PCL...
// Movie = String.Intern(s.QuerySequence.ID.Split ('/') [0]);
Movie = s.QuerySequence.ID.Split ('/') [0];
Sequence = s.QuerySequence as QualitativeSequence;
}
}
}

Просмотреть файл

@ -0,0 +1,40 @@
using System;
using System.Collections.Generic;
using System.IO;
using Bio.IO.BAM;
using Bio.IO.SAM;
namespace Bio.IO.PacBio
{
/// <summary>
/// Parser extensions for the PacBio BAM parsers.
/// </summary>
public static class BAMParserExtensions
{
/// <summary>
/// Returns an iterator over a set of SAMAlignedSequences retrieved from a parsed BAM file.
/// </summary>
/// <param name="parser">Parser</param>
/// <param name="filename">Filename</param>
/// <returns>IEnumerable SAMAlignedSequence object.</returns>
public static IEnumerable<PacBioCCSRead> Parse(this PacBioCCSBamReader parser, string filename)
{
if (parser == null)
{
throw new ArgumentNullException("parser");
}
if (filename == null)
{
throw new ArgumentNullException("filename");
}
using (var fs = File.OpenRead(filename))
{
foreach (var item in PacBioCCSBamReader.Parse(fs))
yield return item;
}
}
}
}

Просмотреть файл

@ -30,5 +30,6 @@
<Compile Include="$(MSBuildThisFileDirectory)Util\CommentedStreamReader.cs" />
<Compile Include="$(MSBuildThisFileDirectory)Util\FileAccessComparer.cs" />
<Compile Include="$(MSBuildThisFileDirectory)Util\FileUtils.cs" />
<Compile Include="$(MSBuildThisFileDirectory)Extensions\PacBioParserExtensions.cs" />
</ItemGroup>
</Project>

Просмотреть файл

@ -217,6 +217,7 @@
<Compile Include="Algorithms\MUMmer\MUMmerAlignerTests.cs" />
<Compile Include="Algorithms\Padena\PadenaBvtTestCases.cs" />
<Compile Include="Algorithms\Padena\PadenaP1TestCases.cs" />
<Compile Include="PacBio\ParserTests.cs" />
</ItemGroup>
<ItemGroup>
<Content Include="..\TestData\Testcases.xml">
@ -266,4 +267,7 @@
-->
<ItemGroup />
<ItemGroup>
<Folder Include="PacBio\" />
</ItemGroup>
</Project>

Просмотреть файл

@ -0,0 +1,37 @@
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using NUnit.Framework;
using Bio.IO.PacBio;
namespace Bio.Tests.PacBio
{
[TestFixture]
public static class ParserTests
{
[Test]
[Category("PacBio")]
public static void SimpleCCSTest ()
{
string fname = System.IO.Path.Combine ("TestUtils", "PacBio", "ccs.bam");
var csp = new PacBioCCSBamReader ();
var seqs = csp.Parse (fname).ToList();
var seq4 = seqs [4];
Assert.AreEqual (146331, seq4.HoleNumber);
Assert.AreEqual (124, seq4.NumPasses);
Assert.AreEqual (2, seq4.ReadCountBadZscore);
Assert.AreEqual (136, seq4.Sequence.Count);
Assert.AreEqual (128, seq4.ZScores.Length);
Assert.AreEqual("m141008_060349_42194_c100704972550000001823137703241586_s1_p0/146331/ccs",
seq4.Sequence.ID);
var seq = new Sequence (DnaAlphabet.Instance, seq4.Sequence.ToArray (), true);
Assert.AreEqual("CCCGGGGATCCTCTAGAATGCTCATACACTGGGGGATACATATACGGGGGGGGGCACATCATCTAGACAGACGACTTTTTTTTTTCGAGCGCAGCTTTTTGAGCGACGCACAAGCTTGCTGAGGACTAGTAGCTTC",
seq.ConvertToString());
Assert.AreEqual (seqs.Count, 7);
}
}
}

Просмотреть файл

@ -2183,5 +2183,11 @@
<None Include="$(MSBuildThisFileDirectory)TestUtils\Wiggle\variable.wig">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
</None>
<None Include="$(MSBuildThisFileDirectory)TestUtils\PacBio\ccs.bam">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
</None>
</ItemGroup>
<ItemGroup>
<Folder Include="$(MSBuildThisFileDirectory)TestUtils\PacBio\" />
</ItemGroup>
</Project>

Двоичные данные
src/Source/Tests/TestData/TestUtils/PacBio/ccs.bam Executable file

Двоичный файл не отображается.