зеркало из https://github.com/dotnetbio/bio.git
Add Parser For PacBio CCS BAM
This adds a parser for the new PacBio CCS data BAM format. It wraps the already existing BAM files and converts the SAMAlignedSequence type into a PacBioCCSRead type for less memory storage and easier access. Tests to verify the behavior are included.
This commit is contained in:
Родитель
0cbd8fe25a
Коммит
0f45347362
|
@ -1,5 +1,5 @@
|
|||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<Project ToolsVersion="12.0" DefaultTargets="Build" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
|
||||
<Project ToolsVersion="4.0" DefaultTargets="Build" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
|
||||
<Import Project="$(MSBuildExtensionsPath)\$(MSBuildToolsVersion)\Microsoft.Common.props" Condition="Exists('$(MSBuildExtensionsPath)\$(MSBuildToolsVersion)\Microsoft.Common.props')" />
|
||||
<PropertyGroup>
|
||||
<MinimumVisualStudioVersion>10.0</MinimumVisualStudioVersion>
|
||||
|
@ -431,6 +431,8 @@
|
|||
<Compile Include="Util\UOPair.cs" />
|
||||
<Compile Include="ValueConverter.cs" />
|
||||
<Compile Include="WordMatch.cs" />
|
||||
<Compile Include="IO\PacBio\PacBioCCSRead.cs" />
|
||||
<Compile Include="IO\PacBio\PacBioCCSBamReader.cs" />
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<EmbeddedResource Include="CloneLibrary\Resources\Library.txt" />
|
||||
|
@ -449,7 +451,7 @@
|
|||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<ProjectReference Include="..\Shims\Bio.Platform.Helpers\Bio.Platform.Helpers.csproj">
|
||||
<Project>{83cbb611-4049-456a-b87b-0b2cfc4797fa}</Project>
|
||||
<Project>{83CBB611-4049-456A-B87B-0B2CFC4797FA}</Project>
|
||||
<Name>Bio.Platform.Helpers</Name>
|
||||
</ProjectReference>
|
||||
</ItemGroup>
|
||||
|
@ -474,4 +476,7 @@ cp "$(TargetDir)\$(TargetName).xml" "$(SolutionDir)\bin\$(TargetName).xml"</Post
|
|||
<Target Name="AfterBuild">
|
||||
</Target>
|
||||
-->
|
||||
<ItemGroup>
|
||||
<Folder Include="IO\PacBio\" />
|
||||
</ItemGroup>
|
||||
</Project>
|
|
@ -0,0 +1,56 @@
|
|||
using System;
|
||||
using Bio;
|
||||
using Bio.IO.BAM;
|
||||
using Bio.IO.SAM;
|
||||
using System.Collections.Generic;
|
||||
using System.Linq;
|
||||
using System.IO;
|
||||
|
||||
namespace Bio.IO.PacBio
|
||||
{
|
||||
/// <summary>
|
||||
/// Reads the data from a PacBio BAM file produced by the pbccs program.
|
||||
/// </summary>
|
||||
public class PacBioCCSBamReader
|
||||
{
|
||||
/// <summary>
|
||||
/// Parse the CCS reads in a PacBio CCS BAM File.
|
||||
/// </summary>
|
||||
/// <returns>The CCS reads.</returns>
|
||||
/// <param name="stream">A stream to parse.</param>
|
||||
public static IEnumerable<PacBioCCSRead> Parse(Stream stream) {
|
||||
|
||||
if (stream == null)
|
||||
{
|
||||
throw new ArgumentNullException("stream");
|
||||
}
|
||||
BAMParser bp = new BAMParser ();
|
||||
var header = bp.GetHeader (stream);
|
||||
var field = header.RecordFields.ToList();
|
||||
|
||||
var pg = field.Where( p => p.Typecode=="PG").FirstOrDefault();
|
||||
if (pg == null) {
|
||||
throw new ArgumentException ("BAM file did not contain a 'PG' tag in header");
|
||||
}
|
||||
|
||||
var cl = pg.Tags.Where (z => z.Tag == "CL").FirstOrDefault ();
|
||||
if (cl == null) {
|
||||
throw new ArgumentException ("BAM file did not contain a 'CL' tag within the 'PG' group in header.");
|
||||
}
|
||||
|
||||
var cmd = cl.Value;
|
||||
if(!cmd.StartsWith("ccs")) {
|
||||
throw new ArgumentException("This is not a BAM file produced by the ccs command.");
|
||||
}
|
||||
|
||||
|
||||
|
||||
stream.Seek(0, SeekOrigin.Begin);
|
||||
foreach (var s in bp.Parse (stream)) {
|
||||
yield return new PacBioCCSRead (s as SAMAlignedSequence);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,160 @@
|
|||
using System;
|
||||
using Bio;
|
||||
using Bio.IO.BAM;
|
||||
using Bio.IO.SAM;
|
||||
using System.Collections.Generic;
|
||||
using System.Linq;
|
||||
|
||||
/// <summary>
|
||||
/// Classes and methods to read and write PacBio data.
|
||||
/// </summary>
|
||||
namespace Bio.IO.PacBio
|
||||
{
|
||||
/// <summary>
|
||||
/// A class representing the consensus sequence generated from multiple subreads. This is produced
|
||||
/// by the program pbccs and output to a BAM file.
|
||||
/// </summary>
|
||||
public class PacBioCCSRead
|
||||
{
|
||||
/// <summary>
|
||||
/// A measure of CCS read quality, currently capped at 99.9% (QV30)
|
||||
/// </summary>
|
||||
public readonly float ReadQuality;
|
||||
|
||||
/// <summary>
|
||||
/// The read group.
|
||||
/// </summary>
|
||||
public readonly string ReadGroup;
|
||||
|
||||
/// <summary>
|
||||
/// The number of subreads that were used to generate the consensus sequence.
|
||||
/// </summary>
|
||||
public readonly int NumPasses;
|
||||
|
||||
/// <summary>
|
||||
/// The outcome counts for results of adding each subread. Most subreads
|
||||
/// should be correctly added, but some may be dropped due to various
|
||||
/// exceptions enumerated here.
|
||||
/// </summary>
|
||||
private readonly int[] statusCounts;
|
||||
|
||||
/// <summary>
|
||||
/// Count of subreads successfully added for consensus generation.
|
||||
/// </summary>
|
||||
public int ReadCountSuccessfullyAdded {
|
||||
get{ return statusCounts [0]; }
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Count of subreads not added to consensus for failing alpha/beta mismatch check.
|
||||
/// </summary>
|
||||
/// <value>The ReadCount alpha beta mismatch.</value>
|
||||
public int ReadCountAlphaBetaMismatch {
|
||||
get { return statusCounts [1]; }
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Count of subreads not added to consensus for allocating too much memory.
|
||||
/// </summary>
|
||||
/// <value>The ReadCount mem fail.</value>
|
||||
public int ReadCountMemFail {
|
||||
get { return statusCounts [2]; }
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Count of subreads not added to consensus for having too low a Z-score.
|
||||
/// </summary>
|
||||
/// <value>The ReadCount bad zscore.</value>
|
||||
public int ReadCountBadZscore {
|
||||
get { return statusCounts [3]; }
|
||||
}
|
||||
public int ReadCountOther {
|
||||
get{ return statusCounts [4]; }
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// The SNR of the A Channel
|
||||
/// </summary>
|
||||
public readonly float SnrA;
|
||||
|
||||
/// <summary>
|
||||
/// The SNR of the C Channel
|
||||
/// </summary>
|
||||
public readonly float SnrC;
|
||||
|
||||
/// <summary>
|
||||
/// The SNR of the G Channel
|
||||
/// </summary>
|
||||
public readonly float SnrG;
|
||||
|
||||
/// <summary>
|
||||
/// The SNR of the T Channel
|
||||
/// </summary>
|
||||
public readonly float SnrT;
|
||||
|
||||
/// <summary>
|
||||
/// The name of the movie this CCS read came from.
|
||||
/// </summary>
|
||||
public string Movie;
|
||||
|
||||
/// <summary>
|
||||
/// What is the hole number for the ZMW.
|
||||
/// </summary>
|
||||
public int HoleNumber;
|
||||
|
||||
/// <summary>
|
||||
/// The average Z-score for all subreads.
|
||||
/// </summary>
|
||||
public float AvgZscore;
|
||||
|
||||
/// <summary>
|
||||
/// An array of Z-scores for each subread added. Subreads that were not added are report as
|
||||
/// Double.NaN values.
|
||||
/// </summary>
|
||||
public readonly float[] ZScores;
|
||||
|
||||
/// <summary>
|
||||
/// The consensus sequence and associated QV values.
|
||||
/// </summary>
|
||||
public QualitativeSequence Sequence;
|
||||
|
||||
/// <summary>
|
||||
/// Initializes a new instance of the <see cref="Bio.IO.PacBio.PacBioCCSRead"/> class. From an initially parsed BAM file.
|
||||
/// </summary>
|
||||
/// <param name="s">S.</param>
|
||||
public PacBioCCSRead (SAMAlignedSequence s)
|
||||
{
|
||||
/* TODO: Converting from binary to string and back is beyond silly...
|
||||
* no performance hit worth worrying about at present, but in the future it might be worth
|
||||
* going directly from binary to the type rather than through string intermediates */
|
||||
foreach (var v in s.OptionalFields) {
|
||||
if (v.Tag == "sn") {
|
||||
var snrs = v.Value.Split (',').Skip (1).Select (x => Convert.ToSingle (x)).ToArray ();
|
||||
SnrA = snrs [0];
|
||||
SnrC = snrs [1];
|
||||
SnrG = snrs [2];
|
||||
SnrT = snrs [3];
|
||||
} else if (v.Tag == "zm") {
|
||||
HoleNumber = (int)Convert.ToInt32 (v.Value);
|
||||
} else if (v.Tag == "rq") {
|
||||
ReadQuality = Convert.ToInt32 (v.Value) / 1000.0f;
|
||||
} else if (v.Tag == "za") {
|
||||
AvgZscore = (float)Convert.ToSingle (v.Value);
|
||||
} else if (v.Tag == "rs") {
|
||||
statusCounts = v.Value.Split (',').Skip (1).Select (x => Convert.ToInt32 (x)).ToArray ();
|
||||
} else if (v.Tag == "np") {
|
||||
NumPasses = Convert.ToInt32 (v.Value);
|
||||
} else if (v.Tag == "RG") {
|
||||
ReadGroup = v.Value;
|
||||
} else if (v.Tag == "zs") {
|
||||
ZScores = v.Value.Split (',').Skip (1).Select (x => Convert.ToSingle (x)).ToArray ();
|
||||
}
|
||||
}
|
||||
// TODO: We should use String.Intern here, but not available in PCL...
|
||||
// Movie = String.Intern(s.QuerySequence.ID.Split ('/') [0]);
|
||||
Movie = s.QuerySequence.ID.Split ('/') [0];
|
||||
Sequence = s.QuerySequence as QualitativeSequence;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,40 @@
|
|||
using System;
|
||||
using System.Collections.Generic;
|
||||
using System.IO;
|
||||
using Bio.IO.BAM;
|
||||
using Bio.IO.SAM;
|
||||
|
||||
namespace Bio.IO.PacBio
|
||||
{
|
||||
|
||||
/// <summary>
|
||||
/// Parser extensions for the PacBio BAM parsers.
|
||||
/// </summary>
|
||||
public static class BAMParserExtensions
|
||||
{
|
||||
/// <summary>
|
||||
/// Returns an iterator over a set of SAMAlignedSequences retrieved from a parsed BAM file.
|
||||
/// </summary>
|
||||
/// <param name="parser">Parser</param>
|
||||
/// <param name="filename">Filename</param>
|
||||
/// <returns>IEnumerable SAMAlignedSequence object.</returns>
|
||||
public static IEnumerable<PacBioCCSRead> Parse(this PacBioCCSBamReader parser, string filename)
|
||||
{
|
||||
if (parser == null)
|
||||
{
|
||||
throw new ArgumentNullException("parser");
|
||||
}
|
||||
if (filename == null)
|
||||
{
|
||||
throw new ArgumentNullException("filename");
|
||||
}
|
||||
|
||||
using (var fs = File.OpenRead(filename))
|
||||
{
|
||||
foreach (var item in PacBioCCSBamReader.Parse(fs))
|
||||
yield return item;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
}
|
|
@ -30,5 +30,6 @@
|
|||
<Compile Include="$(MSBuildThisFileDirectory)Util\CommentedStreamReader.cs" />
|
||||
<Compile Include="$(MSBuildThisFileDirectory)Util\FileAccessComparer.cs" />
|
||||
<Compile Include="$(MSBuildThisFileDirectory)Util\FileUtils.cs" />
|
||||
<Compile Include="$(MSBuildThisFileDirectory)Extensions\PacBioParserExtensions.cs" />
|
||||
</ItemGroup>
|
||||
</Project>
|
|
@ -217,6 +217,7 @@
|
|||
<Compile Include="Algorithms\MUMmer\MUMmerAlignerTests.cs" />
|
||||
<Compile Include="Algorithms\Padena\PadenaBvtTestCases.cs" />
|
||||
<Compile Include="Algorithms\Padena\PadenaP1TestCases.cs" />
|
||||
<Compile Include="PacBio\ParserTests.cs" />
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<Content Include="..\TestData\Testcases.xml">
|
||||
|
@ -266,4 +267,7 @@
|
|||
|
||||
-->
|
||||
<ItemGroup />
|
||||
<ItemGroup>
|
||||
<Folder Include="PacBio\" />
|
||||
</ItemGroup>
|
||||
</Project>
|
|
@ -0,0 +1,37 @@
|
|||
using System;
|
||||
using System.Collections.Generic;
|
||||
using System.Linq;
|
||||
using System.Text;
|
||||
using NUnit.Framework;
|
||||
using Bio.IO.PacBio;
|
||||
|
||||
namespace Bio.Tests.PacBio
|
||||
{
|
||||
[TestFixture]
|
||||
public static class ParserTests
|
||||
{
|
||||
[Test]
|
||||
[Category("PacBio")]
|
||||
public static void SimpleCCSTest ()
|
||||
{
|
||||
string fname = System.IO.Path.Combine ("TestUtils", "PacBio", "ccs.bam");
|
||||
var csp = new PacBioCCSBamReader ();
|
||||
var seqs = csp.Parse (fname).ToList();
|
||||
|
||||
var seq4 = seqs [4];
|
||||
Assert.AreEqual (146331, seq4.HoleNumber);
|
||||
Assert.AreEqual (124, seq4.NumPasses);
|
||||
Assert.AreEqual (2, seq4.ReadCountBadZscore);
|
||||
Assert.AreEqual (136, seq4.Sequence.Count);
|
||||
Assert.AreEqual (128, seq4.ZScores.Length);
|
||||
Assert.AreEqual("m141008_060349_42194_c100704972550000001823137703241586_s1_p0/146331/ccs",
|
||||
seq4.Sequence.ID);
|
||||
var seq = new Sequence (DnaAlphabet.Instance, seq4.Sequence.ToArray (), true);
|
||||
Assert.AreEqual("CCCGGGGATCCTCTAGAATGCTCATACACTGGGGGATACATATACGGGGGGGGGCACATCATCTAGACAGACGACTTTTTTTTTTCGAGCGCAGCTTTTTGAGCGACGCACAAGCTTGCTGAGGACTAGTAGCTTC",
|
||||
seq.ConvertToString());
|
||||
|
||||
Assert.AreEqual (seqs.Count, 7);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -2183,5 +2183,11 @@
|
|||
<None Include="$(MSBuildThisFileDirectory)TestUtils\Wiggle\variable.wig">
|
||||
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
|
||||
</None>
|
||||
<None Include="$(MSBuildThisFileDirectory)TestUtils\PacBio\ccs.bam">
|
||||
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
|
||||
</None>
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<Folder Include="$(MSBuildThisFileDirectory)TestUtils\PacBio\" />
|
||||
</ItemGroup>
|
||||
</Project>
|
Двоичный файл не отображается.
Загрузка…
Ссылка в новой задаче