Merge pull request #2 from dotnetbio/variantcaller

Add Bio.Variant Namespace
This commit is contained in:
Nigel Delaney 2015-08-04 19:43:15 -07:00
Родитель 885fef9064 3adad2773c
Коммит 4f207b370a
9 изменённых файлов: 1245 добавлений и 0 удалений

Просмотреть файл

@ -433,6 +433,12 @@
<Compile Include="WordMatch.cs" />
<Compile Include="IO\PacBio\PacBioCCSRead.cs" />
<Compile Include="IO\PacBio\PacBioCCSBamReader.cs" />
<Compile Include="Variant\AlignmentUtils.cs" />
<Compile Include="Variant\BPandQV.cs" />
<Compile Include="Variant\IndelVariant.cs" />
<Compile Include="Variant\SNPVariant.cs" />
<Compile Include="Variant\Variant.cs" />
<Compile Include="Variant\VariantCaller.cs" />
</ItemGroup>
<ItemGroup>
<EmbeddedResource Include="CloneLibrary\Resources\Library.txt" />
@ -478,5 +484,6 @@ cp "$(TargetDir)\$(TargetName).xml" "$(SolutionDir)\bin\$(TargetName).xml"</Post
-->
<ItemGroup>
<Folder Include="IO\PacBio\" />
<Folder Include="Variant\" />
</ItemGroup>
</Project>

Просмотреть файл

@ -0,0 +1,265 @@
using System;
using Bio.Algorithms.Alignment;
using System.Collections.Generic;
using System.Linq;
using Bio.Extensions;
using System.Diagnostics;
namespace Bio.Variant
{
/// <summary>
/// Utilities to manipulate and work with alignments when calling variants.
/// </summary>
public static class AlignmentUtils
{
/// <summary>
/// The maximum number of times we can shift an indel before we fail.
/// It should never go anywhere near this number and this is only implemented as a guard.
/// </summary>
private const int MAX_LOOPS = 50;
private const string MAX_LOOPS_ERROR = "The Left-align indel step did not converge within 50 iterations";
/// <summary>
/// Verifies the alignment has no leading or trailing gaps and throws an exception otherwise.
/// </summary>
/// <param name="refseq">Refseq.</param>
/// <param name="query">Query.</param>
internal static void VerifyNoGapsOnEnds(byte[] refseq, BPandQV[] query) {
var gap = DnaAlphabet.Instance.Gap;
if (refseq [0] == gap ||
refseq [refseq.Length - 1] == gap ||
query [0].BP == gap ||
query [query.Length - 1].BP == gap) {
throw new FormatException ("Alignment query and/or reference started with a gap character. " +
"Alignments must be hard-clipped to remove starting and trailing variants " +
"before variants can be called");
}
}
/// <summary>
/// Given two byte arrays representing a pairwise alignment, shift them so
/// that all deletions start as early as possible. For example:
/// TTTTAAAATTTT -> Converts to -> TTTTAAAATTTT
/// TTTTAA--TTTT TTTT--AATTTT
///
/// This modifies the array in place.
/// </summary>
/// <param name="refseq">Reference Sequency</param>
/// <param name="query">Query Sequence</param>
/// <returns></returns>
public static void LeftAlignIndels(byte[] refseq, BPandQV[] query)
{
// Validation
if (refseq.Length != query.Length) {
throw new ArgumentException("Alignment passed to LeftAlignIndels had unequal length sequences");
}
ValidateNoOverlappingGaps (refseq, query);
byte gap = DnaAlphabet.Instance.Gap;
// Keep left aligning until we can't anymore, this is a
// do while loop because some downstream left alignments open up
// further ones upstream, even though this is rare.
int change_count = 0;
int loopsThrough = 0;
do
{
loopsThrough++;
change_count = 0;
for (int i = 1; i < refseq.Length; i++)
{
if (refseq[i] == gap)
{
int len = GetGapLength(i, refseq);
int left_side = i - 1;
int right_side = i - 1 + len;
while (left_side >= 0 && refseq[left_side] != gap && (refseq[left_side] == query[right_side].BP))
{
// Move the gap left.
if (right_side < refseq.Length) {
refseq[right_side] = refseq[left_side];
}
refseq[left_side] = gap;
left_side--;
right_side--;
change_count++;
}
if (loopsThrough > MAX_LOOPS) {
throw new Exception(MAX_LOOPS_ERROR);
}
}
else if (query[i].BP == gap)
{
int len = GetGapLength(i, query);
int left_side = i - 1;
int right_side = i - 1 + len;
while (left_side >= 0 && query[left_side].BP != gap && (query[left_side].BP == refseq[right_side]))
{
// Move the gap left.
if (right_side < query.Length) {
query[right_side] = query[left_side];
}
query[left_side] = new BPandQV(gap, 0);
left_side--;
right_side--;
change_count++;
}
if (loopsThrough > MAX_LOOPS) {
throw new Exception(MAX_LOOPS_ERROR);
}
}
}
} while (change_count > 0);
}
/// <summary>
/// Given the start position of a gap, returns how long it is.
/// For example:
///
/// AAAA---TTTT returns 3.
/// </summary>
/// <param name="pos">0 indexed</param>
/// <param name="array"></param>
/// <returns></returns>
public static int GetGapLength(int pos, BPandQV[] array)
{
var gap = DnaAlphabet.Instance.Gap;
int len = 1;
while (++pos < array.Length)
{
if (array[pos].BP == gap)
{
len += 1;
}
else
{
break;
}
}
return len;
}
/// <summary>
/// Given the start position of a gap, returns how long it is.
/// For example:
///
/// AAAA---TTTT returns 3.
/// </summary>
/// <param name="pos">0 indexed</param>
/// <param name="array"></param>
/// <returns></returns>
public static int GetGapLength(int pos, byte[] array)
{
var gap = DnaAlphabet.Instance.Gap;
int len = 1;
while (++pos < array.Length)
{
if (array[pos] == gap)
{
len += 1;
}
else
{
break;
}
}
return len;
}
/// <summary>
/// Simple check that the alignment does not have a gap on top of a
/// gap, which violates several assumptions.
/// </summary>
/// <param name="seq1"></param>
/// <param name="seq2"></param>
internal static void ValidateNoOverlappingGaps(byte[] seq1, BPandQV[] seq2)
{
var gap = DnaAlphabet.Instance.Gap;
for(int i=0;i<seq1.Length;i++)
{
if (seq1[i] == gap && seq2[i].BP == gap)
throw new Exception("You have an alignment with overlapping gaps. Input problem!");
}
}
/// <summary>
/// Reverse complements the BP and QV values accounting for homopolymers.
///
/// This is not a simple operation because in addition to reversing the QV scores, one must account for the fact that
/// the quality value for homopolymer indel errors (that is a deletion or insertion) is typically only placed at the
/// first base in a homopolymer (though this is not standardized). To account for this, when reverse complementing, we switch the
/// QV value of the first and last base in homopolymers if the first base is lower quality than the last base.
/// </summary>
/// <returns>The reverse complemented sequence.</returns>
/// <param name="toFlip">The array with the QV values to flip.</param>
/// <param name="flipHpQvValues">If set to <c>true</c> flip hp qv values.</param>
internal static BPandQV[] GetReverseComplementedSequence(BPandQV[] toFlip, bool flipHpQvValues = false)
{
BPandQV[] newData = new BPandQV[toFlip.Length];
for (long index = 0; index < toFlip.Length; index++)
{
byte complementedSymbol;
byte symbol = toFlip[toFlip.Length - index - 1].BP;
if (!DnaAlphabet.Instance.TryGetComplementSymbol(symbol, out complementedSymbol))
{
throw new NotSupportedException("Bad character in BPandQV array: " + symbol.ToString());
}
var bpandq = new BPandQV(complementedSymbol, toFlip[toFlip.Length - index -1].QV);
newData [index] = bpandq;
}
if (flipHpQvValues) {
ReverseQVValuesForHomopolymers (newData);
}
return newData;
}
/// <summary>
/// Reverses the QV values for homopolymers.
/// This is not a simple operation because in addition to reversing the QV scores, one must account for the fact that
/// the quality value for homopolymer indel errors (that is a deletion or insertion) is typically only placed at the
/// first base in a homopolymer (though this is not standardized). To account for this, when reverse complementing, we switch the
/// QV value of the first and last base in homopolymers if the first base is lower quality than the last base.
/// </summary>
/// <returns>The QV values for homopolymers.</returns>
/// <param name="toFlip">To flip.</param>
internal static void ReverseQVValuesForHomopolymers(BPandQV[] toFlip) {
// Basic idea is to assume it is A, C, G, T alphabet and flip HP values
// Also assumes all low QV is due to HP deletion/insertion error.
if (toFlip.Length > 1) {
byte lastbp = toFlip [0].BP;
int firstPos = 0;
int curLength = 1;
for (int i = 1; i < toFlip.Length; i++) {
byte newbp = toFlip [i].BP;
if (newbp != lastbp) {
if (curLength > 1) {
var right = toFlip [i - 1];
var left = toFlip[firstPos];
Debug.Assert (right.BP == left.BP);
if (right.QV < left.QV) {
toFlip [i - 1] = left;
toFlip [firstPos] = right;
}
}
firstPos = i;
lastbp = newbp;
curLength = 1;
} else if (newbp == lastbp) {
curLength++;
}
}
// Finally flip the end
if (curLength > 1) {
var tmp = toFlip [toFlip.Length - 1];
toFlip [toFlip.Length - 1] = toFlip [firstPos];
toFlip [firstPos] = tmp;
}
}
}
}
}

Просмотреть файл

@ -0,0 +1,43 @@
using System;
namespace Bio.Variant
{
/// <summary>
/// A Basepair and corresponding QV value, used to keep the two values together.
/// </summary>
public struct BPandQV
{
/// <summary>
/// A, C, G, T or '-'
/// </summary>
public readonly byte BP;
/// <summary>
/// A QV Value for the base. Should be set to 0 if no information is available
/// or is a gap character.
/// </summary>
public readonly byte QV;
/// <summary>
/// Initializes a new instance of the <see cref="Bio.Variant.BPandQV"/> struct.
/// </summary>
/// <param name="bp">Bp.</param>
/// <param name="qv">QV value</param>
/// <param name="validate">If set to <c>true</c> validate that the BP is an A, C, G or T</param>
public BPandQV (byte bp, byte qv, bool validate=true)
{
var bp2 = (byte)'-';
if (validate &&
!(bp != 'A' ||
bp != 'C' ||
bp != 'G' ||
bp != 'T' ||
bp != '-')) {
throw new ArgumentOutOfRangeException ("bp", "Basepairs must be A, C, G, T or -");
}
BP = bp;
QV = qv;
}
}
}

Просмотреть файл

@ -0,0 +1,120 @@
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using Bio;
namespace Bio.Variant
{
/// <summary>
/// The two types of indels, insertions or deletions.
/// </summary>
[Flags]
public enum IndelType : byte {
/// <summary>
/// An insertion relative to the reference.
/// </summary>
Insertion,
/// <summary>
/// A deletion relative to the reference.
/// </summary>
Deletion}
/// <summary>
/// An Indel Variant.
/// </summary>
public class IndelVariant : Variant
{
/// <summary>
/// Insertion or Deletion
/// </summary>
public readonly IndelType InsertionOrDeletion;
/// <summary>
/// These are the bases in the insertion (or what was deleted).
/// </summary>
public readonly string InsertedOrDeletedBases;
/// <summary>
/// True if the homopolymer length of the reference is > 1.
/// <code>
/// e.g. ACC -> TRUE
/// A-C
/// </code>
/// </summary>
/// <value><c>true</c> if in homo polymer; otherwise, <c>false</c>.</value>
public bool InHomopolymer {
get {
return HomopolymerLengthInReference > 1;
}
}
///
/// At the position of the insertion or deletion, what is the length of the nearest homopolymer run in the reference.
/// For insertions, the HP length is defined by the next base in the reference.
///
/// <code>
/// For example, the insertion below occurs at an HP of char 'C' and length 3.
///
/// ref: AT-CCCTG
/// query: ATCCCCTG
///
/// For multiple insertions or a long deletion, only the HP of the first position is counted,
/// so the example below would be a 2 bp long 'C'
///
///
/// ref: ATCCTGCATA
/// query: AT-----ATA
///
///
/// </code>
///
public int HomopolymerLengthInReference {
get;
set;
}
/// <summary>
/// A, C, G or T the basepair of the homopolymer.
/// </summary>
/// <value>The homopolymer base.</value>
public char HomopolymerBase {
get;
set;
}
/// <summary>
/// Initializes a new instance of the <see cref="Bio.Variant.IndelVariant"/> class.
/// </summary>
/// <param name="position">Position.</param>
/// <param name="length">Length.</param>
/// <param name="bases">Bases.</param>
/// <param name="insertionOrDeletion">Insertion or deletion.</param>
/// <param name="hpBase">Hp base.</param>
/// <param name="hpLengthInReference">Hp length in reference.</param>
/// <param name="atAlignmentEnd">If set to <c>true</c> at alignment end.</param>
public IndelVariant(int position, int length, string bases, IndelType insertionOrDeletion, char hpBase, int hpLengthInReference, bool atAlignmentEnd = false)
: base (position, atAlignmentEnd)
{
Type = VariantType.INDEL;
InsertionOrDeletion = insertionOrDeletion;
Length = length;
InsertedOrDeletedBases = bases;
HomopolymerBase = hpBase;
HomopolymerLengthInReference = hpLengthInReference;
}
/// <summary>
/// Returns a <see cref="System.String"/> that represents the current <see cref="Bio.Variant.IndelVariant"/>.
/// </summary>
/// <returns>A <see cref="System.String"/> that represents the current <see cref="Bio.Variant.IndelVariant"/>.</returns>
public override string ToString ()
{
var insert = this.InsertionOrDeletion == IndelType.Deletion ? "Deletion" : "Insertion";
return string.Format ("[IndelVariant: InHomopolymer={0}, HomopolymerLengthInReference={1}, HomopolymerBase={2}, Type={3}]", InHomopolymer, HomopolymerLengthInReference, HomopolymerBase, insert);
}
}
}

Просмотреть файл

@ -0,0 +1,51 @@
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using System.Diagnostics;
using Bio;
namespace Bio.Variant
{
/// <summary>
/// A class to store information about a SNP variant.
/// </summary>
public class SNPVariant : Variant
{
/// <summary>
/// The BP in the reference at this position.
/// </summary>
public readonly char RefBP;
/// <summary>
/// The variant present at this position.
/// </summary>
public readonly char AltBP;
/// <summary>
/// Create a new SNP in the given reference at the given position.
/// </summary>
/// <param name="position">0-based position on reference.</param>
/// <param name="altAllele">The variant present (A, C, G, T)</param>
/// <param name="refBP">The Reference sequence.</param>
public SNPVariant(int position, char altAllele, char refBP) :
base (position)
{
AltBP = altAllele;
Type = VariantType.SNP;
RefBP = (char) refBP;
Length = 1;
}
/// <summary>
/// Returns a <see cref="System.String"/> that represents the current <see cref="Bio.Variant.SNPVariant"/>.
/// </summary>
/// <returns>A <see cref="System.String"/> that represents the current <see cref="Bio.Variant.SNPVariant"/>.</returns>
public override string ToString ()
{
return RefBP + "->" + AltBP + " @ " + StartPosition;
}
}
}

Просмотреть файл

@ -0,0 +1,179 @@
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using Bio;
namespace Bio.Variant
{
/// <summary>
/// Top level class for holding variant information.
/// This is implemented in two sub classes, SNPVariant and IndelVariant.
/// </summary>
public abstract class Variant : IComparable<Variant>, IEquatable<Variant>
{
/// <summary>
/// The name of the reference sequence the variant was called from.
/// </summary>
public string RefName;
/// <summary>
/// SNP, indel, or complex.
/// </summary>
public VariantType Type { get; protected set; }
/// <summary>
/// 0-based start position of variant.
/// <code>
/// For Indels, this is the left most position
/// BEFORE the event. e.g.
/// <code>
/// AAATTTAAA -> is a deletion of length 3 starting at position 2.
/// AAA---AAA
///
/// A--TTAAA -> is an insertion of length 2 starting at position 0.
/// ACCTTAAA
///
/// ACCTTAAA -> is a deletion at position 3.
/// ACC-TAAA
/// </code>
///
/// </summary>
public int StartPosition { get; set; }
/// <summary>
/// O-based end index (same as start for SNPs).
/// A SNP is of length 1, a deletion of length 2 is 2, etc.
/// </summary>
public int Length { get; protected set; }
/// <summary>
/// The position in the alignment where this variant ends, exclusive.
/// For a SNP, this is the position AFTER the SNP. Same for indels.
///
/// <code>
/// AACCTT -> End Position = 3
/// AAGCTT
/// </code>
///
/// </summary>
public int EndPosition
{
get { return StartPosition + Length; }
}
/// <summary>
/// Is the variant at the very start or end of an alignment?
/// (that is it was called based on the first or last base seen on
/// either sequence in the alignment.)
/// These can have certain pathologies so we take note and keep an eye on them.
/// They should almost always be excluded by the alignment algorithm clipping at the ends.
/// </summary>
public bool AtEndOfAlignment { get; protected set; }
/// <summary>
/// The QV value for this call, if it exists. If not, this is set to 0.
/// </summary>
/// <value>The QV value.</value>
public byte QV {get; set;}
/// <summary>
/// Initializes a new instance of the <see cref="Bio.Variant.Variant"/> class.
/// </summary>
/// <param name="position">Position.</param>
/// <param name="atAlignmentEnd">If set to <c>true</c> at alignment end.</param>
public Variant(int position, bool atAlignmentEnd = false)
{
StartPosition = position;
AtEndOfAlignment = atAlignmentEnd;
}
/// <summary>
/// Serves as a hash function for a <see cref="Bio.Variant.Variant"/> object.
/// </summary>
/// <returns>A hash code for this instance that is suitable for use in hashing algorithms and data structures such as a
/// hash table.</returns>
public override int GetHashCode ()
{
//TODO: This is a little unsafe as these fields are mutable...
var h1 = RefName != null ? RefName.GetHashCode () : 0;
var h2 = StartPosition.GetHashCode ();
var h3 = EndPosition.GetHashCode ();
return h1 ^ h2 ^ h3;
}
/// <summary>
/// Determines whether the specified <see cref="System.Object"/> is equal to the current <see cref="Bio.Variant.Variant"/>.
///
/// Two variants are "equal" if they have the same start and end positions and are of the same type.
/// </summary>
/// <param name="obj">The <see cref="System.Object"/> to compare with the current <see cref="Bio.Variant.Variant"/>.</param>
/// <returns><c>true</c> if the specified <see cref="System.Object"/> is equal to the current
/// <see cref="Bio.Variant.Variant"/>; otherwise, <c>false</c>.</returns>
public override bool Equals (object obj)
{
var res = obj as Variant;
if (res != null)
return this.Equals (res);
return false;
}
#region IComparable implementation
/// <Docs>To be added.</Docs>
/// <para>Returns the sort order (Reference Name, Start Position) of the current variant compared to the specified object.</para>
/// <summary>
///
/// </summary>
/// <returns>The to.</returns>
/// <param name="other">Other.</param>
public int CompareTo (Variant other)
{
var nameComp = String.CompareOrdinal(RefName,other.RefName);
if (nameComp == 0) {
return StartPosition.CompareTo (other.StartPosition);
}
return nameComp;
}
#endregion
#region IEquatable implementation
/// <summary>
/// Determines whether the specified <see cref="Bio.Variant.Variant"/> is equal to the current <see cref="Bio.Variant.Variant"/>.
///
/// Two variants are equal if they have the same reference, start/end and type.
/// </summary>
/// <param name="other">The <see cref="Bio.Variant.Variant"/> to compare with the current <see cref="Bio.Variant.Variant"/>.</param>
/// <returns><c>true</c> if the specified <see cref="Bio.Variant.Variant"/> is equal to the current
/// <see cref="Bio.Variant.Variant"/>; otherwise, <c>false</c>.</returns>
bool IEquatable<Variant>.Equals (Variant other)
{
var otherComp = CompareTo (other);
if (otherComp == 0) {
return this.StartPosition == other.StartPosition && this.Type == other.Type && this.EndPosition == other.EndPosition;
}
return false;
}
#endregion
}
/// <summary>
/// The type of variant, either SNP, INDEL or Complex.
/// </summary>
public enum VariantType
{
/// <summary>
/// A SNP variant.
/// </summary>
SNP,
/// <summary>
/// And Indel variant.
/// </summary>
INDEL,
/// <summary>
/// A Complex variant. Not currently used but a place holder for more complex scenarios in the future.
/// </summary>
Complex
}
}

Просмотреть файл

@ -0,0 +1,271 @@
using System;
using System.Linq;
using System.Collections.Generic;
using System.Diagnostics;
using Bio;
using Bio.Extensions;
using Bio.Algorithms.Alignment;
namespace Bio.Variant
{
/// <summary>
/// This class takes alignments
/// and generates a list of variants.
/// </summary>
public static class VariantCaller
{
/// <summary>
/// Given two byte arrays representing a pairwise alignment, shift them so
/// that all deletions start as early as possible. For example:
///
/// <code>
/// TTTTAAAATTTT -> Converts to -> TTTTAAAATTTT
/// TTTTAA--TTTT TTTT--AATTTT
/// </code>
///
/// This function takes a IPairwiseSequenceAlignment and assumes that the first sequence is the reference and second
/// sequence is the query. It returns a new Pairwise sequence alignment with all of the indels left aligned as well as a list of variants.
/// </summary>
/// <param name="aln">Aln. The second sequence should be of type QualitativeSequence or Sequence</param>
/// <param name="callVariants">callVariants. If true, it will call variants, otherwise the second half of tuple will be null. </param>
public static Tuple<IPairwiseSequenceAlignment, List<Variant>> LeftAlignIndelsAndCallVariants(IPairwiseSequenceAlignment aln, bool callVariants = true) {
if (aln == null) {
throw new NullReferenceException ("aln");
}
if (aln.PairwiseAlignedSequences == null || aln.PairwiseAlignedSequences.Count != 1) {
throw new ArgumentException ("The pairwise aligned sequence should only have one alignment");
}
var frstAln = aln.PairwiseAlignedSequences.First ();
var seq1 = frstAln.FirstSequence;
var seq2 = frstAln.SecondSequence;
if (seq1 == null) {
throw new NullReferenceException ("seq1");
} else if (seq2 == null) {
throw new NullReferenceException ("seq2");
}
//TODO: Might implement an ambiguity check later.
#if FALSE
if (seq1.Alphabet.HasAmbiguity || seq2.Alphabet.HasAmbiguity) {
throw new ArgumentException ("Cannot left align sequences with ambiguous symbols.");
}
#endif
// Note we have to copy unless we can guarantee the array will not be mutated.
byte[] refseq = seq1.ToArray ();
ISequence newQuery;
List<Variant> variants = null;
// Call variants for a qualitative sequence
if (seq2 is QualitativeSequence) {
var qs = seq2 as QualitativeSequence;
var query = Enumerable.Zip (qs, qs.GetQualityScores (), (bp, qv) => new BPandQV (bp, (byte)qv, false)).ToArray ();
AlignmentUtils.LeftAlignIndels (refseq, query);
AlignmentUtils.VerifyNoGapsOnEnds (refseq, query);
if (callVariants) {
variants = VariantCaller.CallVariants (refseq, query, seq2.IsMarkedAsReverseComplement());
}
var newQueryQS = new QualitativeSequence (qs.Alphabet,
qs.FormatType,
query.Select (z => z.BP).ToArray (),
query.Select (p => p.QV).ToArray (),
false);
newQueryQS.Metadata = seq2.Metadata;
newQuery = newQueryQS;
} else if (seq2 is Sequence) { // For a sequence with no QV values.
var qs = seq2 as Sequence;
var query = qs.Select (v => new BPandQV (v, 0, false)).ToArray();
AlignmentUtils.LeftAlignIndels (refseq, query);
AlignmentUtils.VerifyNoGapsOnEnds (refseq, query);
// ISequence does not have a setable metadata
var newQueryS = new Sequence(qs.Alphabet, query.Select(z=>z.BP).ToArray(), false);
newQueryS.Metadata = seq2.Metadata;
if (callVariants) {
variants = VariantCaller.CallVariants (refseq, query, seq2.IsMarkedAsReverseComplement());
}
newQuery = newQueryS;
} else {
throw new ArgumentException ("Can only left align indels if the query sequence is of type Sequence or QualitativeSequence.");
}
if (aln.FirstSequence != null && aln.FirstSequence.ID != null) {
foreach (var v in variants) {
v.RefName = aln.FirstSequence.ID;
}
}
var newRef = new Sequence (seq1.Alphabet, refseq, false);
newRef.ID = seq1.ID;
newRef.Metadata = seq1.Metadata;
newQuery.ID = seq2.ID;
var newaln = new PairwiseSequenceAlignment (aln.FirstSequence, aln.SecondSequence);
var pas = new PairwiseAlignedSequence ();
pas.FirstSequence = newRef;
pas.SecondSequence = newQuery;
newaln.Add (pas);
return new Tuple<IPairwiseSequenceAlignment, List<Variant>> (newaln, variants);
}
/// <summary>
/// Given a pairwise sequence alignment, call variants, producing
/// a list of SNPs and Indels found in the alignment.
///
/// This method will first left-align all variants before calling to be consistent with other
/// software. The
/// </summary>
/// <param name="aln">The Pairwise alignment to call variants with.</param>
/// <returns></returns>
///
public static List<Variant> CallVariants(IPairwiseSequenceAlignment aln) {
return LeftAlignIndelsAndCallVariants (aln).Item2;
}
/// <summary>
/// Calls the variants.
///
/// Should only be used internally as assumptions are made that the alignments are left-aligned and fulfill certain criteria.
/// </summary>
/// <returns>The variants.</returns>
/// <param name="refSeq">Reference seq.</param>
/// <param name="querySeq">Query seq.</param>
/// <param name="originallyReverseComplemented">If set to <c>true</c> the query sequence was originally reverse complemented. (this affects QV value scoring)</param>
internal static List<Variant> CallVariants(byte[] refSeq, BPandQV[] querySeq, bool originallyReverseComplemented)
{
if (originallyReverseComplemented) {
AlignmentUtils.ReverseQVValuesForHomopolymers (querySeq);
}
List<Variant> variants = new List<Variant>();
// Now call variants.
var gap = DnaAlphabet.Instance.Gap;
int i = 0;
int refPos = 0;
int queryPos = 0;
while( i < refSeq.Length)
{
if (refSeq[i] == gap)
{
int len = AlignmentUtils.GetGapLength(i, refSeq);
var nextBasePos = (i + len);
// Should alway be true as we don't end in gaps
Debug.Assert (nextBasePos < refSeq.Length);
var hplenAndChar = determineHomoPolymerLength (nextBasePos, refSeq);
var bases = getBases(querySeq, i, len);
var newVariant = new IndelVariant(refPos - 1, len, bases, IndelType.Insertion,
hplenAndChar.Item2, hplenAndChar.Item1,
(i == 0 || (i + len + hplenAndChar.Item1) >= refSeq.Length));
newVariant.QV = querySeq[queryPos].QV;
variants.Add(newVariant);
i += len;
queryPos += len;
}
else if (querySeq[i].BP == gap)
{
int len = AlignmentUtils.GetGapLength(i, querySeq);
var bases = getBases(refSeq, i, len);
var hplenAndChar = determineHomoPolymerLength (i, refSeq);
var newVariant = new IndelVariant(refPos - 1, len, bases,
IndelType.Deletion, hplenAndChar.Item2,
hplenAndChar.Item1, (i == 0 || (i + len + hplenAndChar.Item1) >= refSeq.Length));
/* An insertion mutation occurs BEFORE pos, so normally we get the next base
* or the last one if it's a reverse complemented alignment. However, this is not true if
* it is a homopolymer because what would have been the previous position is the next position
* after left aligning and reversing the position of the QV value.
*
* Consider the following
* --*- -*--
* A-TA --> TA-T
* AGTA TACT
*
* However,
* --*-- --*--
* A-TTA ----> T-AAT
* ATTTA TAAAT
*
*/
if ((i + len ) < querySeq.Length) {
var qc_pos = originallyReverseComplemented ? i - 1 : i + len;
if (newVariant.InHomopolymer) {
qc_pos = i + len;
}
newVariant.QV = querySeq[qc_pos].QV;
}
variants.Add(newVariant);
i += len;
refPos += len;
}
else
{
if (querySeq[i].BP != refSeq[i])
{
var newVariant = new SNPVariant(refPos, (char) querySeq[i].BP, (char)refSeq[i]);
newVariant.QV = querySeq [queryPos].QV;
variants.Add(newVariant);
}
i++; refPos++; queryPos++;
}
}
return variants;
}
/// <summary>
/// Converts a subset of bases in the array into a string.
/// </summary>
/// <param name="array"></param>
/// <param name="position"></param>
/// <param name="length"></param>
/// <returns></returns>
private static string getBases(byte[] array, int position, int length)
{
char[] chars = new char[length];
for(int i=0; i<length; i++)
{
chars[i] = (char)array[i+position];
}
return new string(chars);
}
/// <summary>
/// Gets the bases for a length of the sequence as a string.
/// </summary>
/// <returns>The bases.</returns>
/// <param name="array">Array.</param>
/// <param name="position">Position.</param>
/// <param name="length">Length.</param>
private static string getBases(BPandQV[] array, int position, int length)
{
char[] chars = new char[length];
for(int i=0; i<length; i++)
{
chars[i] = (char)array[i+position].BP;
}
return new string(chars);
}
/// <summary>
/// Returns the length of the homopolymer
/// </summary>
/// <returns>The homopolymer length.</returns>
private static Tuple<int, char> determineHomoPolymerLength(int pos, byte[] refSeq)
{
byte start_bp = refSeq[pos];
int length = 1;
while ( ++pos < refSeq.Length &&
refSeq [pos] == start_bp) {
length++;
}
var homopolymerBase = (char)start_bp;
return new Tuple<int,char> (length, homopolymerBase);
}
}
}

Просмотреть файл

@ -218,6 +218,7 @@
<Compile Include="Algorithms\Padena\PadenaBvtTestCases.cs" />
<Compile Include="Algorithms\Padena\PadenaP1TestCases.cs" />
<Compile Include="PacBio\ParserTests.cs" />
<Compile Include="Variant\VariantCallTests.cs" />
</ItemGroup>
<ItemGroup>
<Content Include="..\TestData\Testcases.xml">
@ -269,5 +270,6 @@
<ItemGroup />
<ItemGroup>
<Folder Include="PacBio\" />
<Folder Include="Variant\" />
</ItemGroup>
</Project>

Просмотреть файл

@ -0,0 +1,307 @@
using System;
using System.Collections.Generic;
using System.Linq;
using Bio;
using Bio.Variant;
using Bio.Algorithms.Alignment;
using Bio.Extensions;
using NUnit.Framework;
namespace Bio.Tests
{
[TestFixture]
[Category("VariantCalls")]
public static class VariantCallTests
{
/// <summary>
/// Converts the Sequence to a QualitativeSequence in the alignment.
/// </summary>
/// <param name="aln">Aln.</param>
/// <param name="qualScores">Qual scores.</param>
public static void ConvertAlignedSequenceToQualSeq(IPairwiseSequenceAlignment aln, int[] qualScores) {
var q = aln.PairwiseAlignedSequences [0].SecondSequence as Sequence;
var qvs = new int[q.Count];
int queryPos = 0;
for (int i = 0; i < qvs.Length; i++) {
if (q [i] == '-') {
qvs [i] = 0;
} else {
qvs [i] = qualScores[queryPos++];
}
}
var qseq = new QualitativeSequence (DnaAlphabet.Instance, FastQFormatType.Sanger, q.ToArray (), qvs, false);
aln.PairwiseAlignedSequences [0].SecondSequence = qseq;
}
[Test]
public static void Test1BPInsertionCall()
{
string seq1seq = "ATA-CCCTT".Replace("-", String.Empty);
string seq2seq = "ATACCCCTT";
int[] seq2qual = new int[] { 30, 30, 30, 4, 30, 30, 30, 30, 30 };
var refseq = new Sequence(AmbiguousDnaAlphabet.Instance, seq1seq, false);
var query = new Sequence (AmbiguousDnaAlphabet.Instance, seq2seq, false);
NeedlemanWunschAligner aligner = new NeedlemanWunschAligner ();
var aln = aligner.Align (refseq, query).First();
// Need to add in the QV Values.
ConvertAlignedSequenceToQualSeq(aln, seq2qual);
var variants = VariantCaller.CallVariants (aln);
Assert.AreEqual (variants.Count, 1);
var variant = variants.First ();
Assert.AreEqual (4, variant.QV);
Assert.AreEqual (2, variant.StartPosition);
Assert.AreEqual (VariantType.INDEL, variant.Type);
var vi = variant as IndelVariant;
Assert.AreEqual ("C", vi.InsertedOrDeletedBases);
Assert.AreEqual ('C', vi.HomopolymerBase);
Assert.AreEqual (3, vi.HomopolymerLengthInReference);
Assert.AreEqual (true, vi.InHomopolymer);
Assert.AreEqual (vi.InsertionOrDeletion, IndelType.Insertion);
}
[Test]
public static void Test1BPDeletionCall()
{
string seq1seq = "ATACCCCTT";
string seq2seq = "ATA-CCCTT".Replace("-", String.Empty);
int[] seq2qual = new int[] { 30, 30, 30, 2, 30, 30, 30, 30 };
var refseq = new Sequence(AmbiguousDnaAlphabet.Instance, seq1seq, false);
var query = new Sequence (AmbiguousDnaAlphabet.Instance, seq2seq, false);
NeedlemanWunschAligner aligner = new NeedlemanWunschAligner ();
var aln = aligner.Align (refseq, query).First();
// Need to add in the QV Values.
ConvertAlignedSequenceToQualSeq(aln, seq2qual);
var variants = VariantCaller.CallVariants (aln);
Assert.AreEqual (variants.Count, 1);
var variant = variants.First ();
Assert.AreEqual (2, variant.QV);
Assert.AreEqual (2, variant.StartPosition);
Assert.AreEqual (VariantType.INDEL, variant.Type);
var vi = variant as IndelVariant;
Assert.AreEqual ("C", vi.InsertedOrDeletedBases);
Assert.AreEqual ('C', vi.HomopolymerBase);
Assert.AreEqual (4, vi.HomopolymerLengthInReference);
Assert.AreEqual (true, vi.InHomopolymer);
Assert.AreEqual (vi.InsertionOrDeletion, IndelType.Deletion);
}
[Test]
public static void TestSNPCall()
{
string seq1seq = "ATCCCCCTT";
string seq2seq = "ATCCCTCTT";
int[] seq2qual = new int[] { 30, 30, 30, 30, 5, 3, 30, 30, 30 };
var refseq = new Sequence(DnaAlphabet.Instance, seq1seq);
var query = new Sequence (DnaAlphabet.Instance, seq2seq);
NeedlemanWunschAligner aligner = new NeedlemanWunschAligner ();
var aln = aligner.Align (refseq, query).First();
ConvertAlignedSequenceToQualSeq (aln, seq2qual);
var variants = VariantCaller.CallVariants (aln);
Assert.AreEqual (variants.Count, 1);
var variant = variants.First ();
Assert.AreEqual (3, variant.QV);
Assert.AreEqual (5, variant.StartPosition);
Assert.AreEqual (variant.Type, VariantType.SNP);
var vi = variant as SNPVariant;
Assert.AreEqual (1, vi.Length);
Assert.AreEqual ('T', vi.AltBP);
Assert.AreEqual ('C', vi.RefBP);
Assert.AreEqual (VariantType.SNP, vi.Type);
Assert.AreEqual (false, vi.AtEndOfAlignment);
}
[Test]
public static void TestLeftAlignmentStep() {
var refseq = "ACAATAAAAGCGCGCGCGCGTTACGTATAT--ATGGATAT";
var queryseq = "ACAATAA-AGC--GCGC--GTTACGTATATATATGGATAT";
var r = new Sequence (DnaAlphabet.Instance, refseq);
var q = new Sequence (DnaAlphabet.Instance, queryseq);
var aln = new PairwiseSequenceAlignment (r, q);
var pas = new PairwiseAlignedSequence ();
pas.FirstSequence = r;
pas.SecondSequence = q;
aln.Add (pas);
var tpl = VariantCaller.LeftAlignIndelsAndCallVariants (aln, true);
// Check the left alignment
aln = tpl.Item1 as PairwiseSequenceAlignment;
var lar = aln.PairwiseAlignedSequences [0].FirstSequence.ConvertToString();
var laq = aln.PairwiseAlignedSequences [0].SecondSequence.ConvertToString();
var exprefseq = "ACAATAAAAGCGCGCGCGCGTTACG--TATATATGGATAT";
var expqueryseq = "ACAAT-AAA----GCGCGCGTTACGTATATATATGGATAT";
Assert.AreEqual (exprefseq, lar);
Assert.AreEqual (expqueryseq, laq);
// And it's hard, so we might as well check the variants
var variants = tpl.Item2;
Assert.AreEqual (3, variants.Count);
string[] bases = new string[] { "A", "GCGC", "TA" };
char[] hpbases = new char[] { 'A', 'G', 'T' };
bool[] inHp = new bool[] { true, false, false };
int[] lengths = new int[] { 1, 4, 2 };
int[] starts = new int[] { 4, 8, 24 };
IndelType[] types = new IndelType[] { IndelType.Deletion, IndelType.Deletion, IndelType.Insertion };
for (int i = 0; i < 3; i++) {
Assert.AreEqual (VariantType.INDEL, variants [i].Type);
var vi = variants [i] as IndelVariant;
Assert.AreEqual (hpbases[i], vi.HomopolymerBase);
Assert.AreEqual (starts [i], vi.StartPosition);
Assert.AreEqual (lengths [i], vi.Length);
Assert.AreEqual (bases [i], vi.InsertedOrDeletedBases);
Assert.AreEqual (inHp [i], vi.InHomopolymer);
Assert.AreEqual (types [i], vi.InsertionOrDeletion);
}
}
[Test]
public static void TestReverseComplement1BPIndelCall() {
string seq1seq = "ATACCCCTTGCGC";
string seq2seq = "ATA-CCCTTGCGC".Replace("-", String.Empty);
int[] seq2qual = new int[] { 30, 30, 30, 2, 30, 30, 30, 30, 30, 30, 30, 30 };
var refseq = new Sequence(DnaAlphabet.Instance, seq1seq);
var query = new Sequence (DnaAlphabet.Instance, seq2seq);
var s1rc = refseq.GetReverseComplementedSequence ();
var s2rc = query.GetReverseComplementedSequence ();
NeedlemanWunschAligner aligner = new NeedlemanWunschAligner ();
var aln = aligner.Align (s1rc, s2rc).First();
VariantCallTests.ConvertAlignedSequenceToQualSeq (aln, seq2qual.Reverse ().ToArray ());
aln.PairwiseAlignedSequences [0].Sequences [1].MarkAsReverseComplement ();
var variants = VariantCaller.CallVariants (aln);
Assert.AreEqual (variants.Count, 1);
var variant = variants.First ();
Assert.AreEqual (2, variant.QV);
Assert.AreEqual (5, variant.StartPosition);
Assert.AreEqual (VariantType.INDEL, variant.Type);
var vi = variant as IndelVariant;
Assert.AreEqual (IndelType.Deletion, vi.InsertionOrDeletion);
Assert.AreEqual ('G', vi.HomopolymerBase);
Assert.AreEqual (1, vi.Length);
Assert.AreEqual (4, vi.HomopolymerLengthInReference);
Assert.AreEqual (true, vi.InHomopolymer);
Assert.AreEqual ("G", vi.InsertedOrDeletedBases);
Assert.AreEqual (false, vi.AtEndOfAlignment);
Assert.AreEqual (6, vi.EndPosition);
}
[Test]
public static void TestTrickyQVInversions() {
// This will be hard because normally flip the QV value for a homopolymer, but in this case we won't.
// Note the whole notion of flipping is poorly defined.
string seq1seq = "ATTGC";
string seq2seq = "ATAGC";
int[] seq2qual = new int[] { 30, 30, 2, 30, 30 };
var refseq = new Sequence(DnaAlphabet.Instance, seq1seq);
var query = new Sequence (DnaAlphabet.Instance, seq2seq);
var s1rc = refseq.GetReverseComplementedSequence ();
var s2rc = query.GetReverseComplementedSequence ();
NeedlemanWunschAligner aligner = new NeedlemanWunschAligner ();
var aln = aligner.Align (s1rc, s2rc).First();
VariantCallTests.ConvertAlignedSequenceToQualSeq (aln, seq2qual.Reverse ().ToArray ());
aln.PairwiseAlignedSequences [0].Sequences [1].MarkAsReverseComplement ();
var variants = VariantCaller.CallVariants (aln);
Assert.AreEqual (1, variants.Count);
var variant = variants.First ();
Assert.AreEqual (VariantType.SNP, variant.Type);
Assert.AreEqual (2, variant.QV);
var vs = variant as SNPVariant;
Assert.AreEqual ('T', vs.AltBP);
Assert.AreEqual ('A', vs.RefBP);
}
[Test]
public static void TestInsertionAtEndofHP() {
string seq1seq = "ATA-CCC".Replace("-", String.Empty);
string seq2seq = "ATACCCC";
int[] seq2qual = new int[] { 30, 30, 30, 4, 30, 30, 30 };
var refseq = new Sequence(AmbiguousDnaAlphabet.Instance, seq1seq, false);
var query = new Sequence (AmbiguousDnaAlphabet.Instance, seq2seq, false);
NeedlemanWunschAligner aligner = new NeedlemanWunschAligner ();
var aln = aligner.Align (refseq, query).First();
// Need to add in the QV Values.
ConvertAlignedSequenceToQualSeq(aln, seq2qual);
var variants = VariantCaller.CallVariants (aln);
Assert.AreEqual (variants.Count, 1);
var variant = variants.First ();
Assert.AreEqual (4, variant.QV);
Assert.AreEqual (2, variant.StartPosition);
Assert.AreEqual (VariantType.INDEL, variant.Type);
var vi = variant as IndelVariant;
Assert.AreEqual ("C", vi.InsertedOrDeletedBases);
Assert.AreEqual ('C', vi.HomopolymerBase);
Assert.AreEqual (true, vi.AtEndOfAlignment);
Assert.AreEqual (3, vi.HomopolymerLengthInReference);
Assert.AreEqual (true, vi.InHomopolymer);
Assert.AreEqual (vi.InsertionOrDeletion, IndelType.Insertion);
}
[Test]
public static void TestDeletionAtEndofHP() {
string seq1seq = "ATACCCC";
string seq2seq = "ATA-CCC".Replace("-", String.Empty);
int[] seq2qual = new int[] { 30, 30, 30, 4, 30, 30 };
var refseq = new Sequence(AmbiguousDnaAlphabet.Instance, seq1seq, false);
var query = new Sequence (AmbiguousDnaAlphabet.Instance, seq2seq, false);
NeedlemanWunschAligner aligner = new NeedlemanWunschAligner ();
var aln = aligner.Align (refseq, query).First();
// Need to add in the QV Values.
ConvertAlignedSequenceToQualSeq(aln, seq2qual);
var variants = VariantCaller.CallVariants (aln);
Assert.AreEqual (variants.Count, 1);
var variant = variants.First ();
Assert.AreEqual (4, variant.QV);
Assert.AreEqual (2, variant.StartPosition);
Assert.AreEqual (VariantType.INDEL, variant.Type);
var vi = variant as IndelVariant;
Assert.AreEqual ("C", vi.InsertedOrDeletedBases);
Assert.AreEqual ('C', vi.HomopolymerBase);
Assert.AreEqual (true, vi.AtEndOfAlignment);
Assert.AreEqual (4, vi.HomopolymerLengthInReference);
Assert.AreEqual (true, vi.InHomopolymer);
Assert.AreEqual (vi.InsertionOrDeletion, IndelType.Deletion);
}
[Test]
public static void TestExceptionThrownForUnclippedAlignment() {
var refseq = "ACAATATA";
var queryseq = "ACAATAT-";
var r = new Sequence (DnaAlphabet.Instance, refseq);
var q = new Sequence (DnaAlphabet.Instance, queryseq);
var aln = new PairwiseSequenceAlignment (r, q);
var pas = new PairwiseAlignedSequence ();
pas.FirstSequence = r;
pas.SecondSequence = q;
aln.Add (pas);
Assert.Throws<FormatException> (() => VariantCaller.LeftAlignIndelsAndCallVariants (aln, true));
refseq = "AAACAATATA";
queryseq = "AA-CAATATA";
r = new Sequence (DnaAlphabet.Instance, refseq);
q = new Sequence (DnaAlphabet.Instance, queryseq);
aln = new PairwiseSequenceAlignment (r, q);
pas = new PairwiseAlignedSequence ();
pas.FirstSequence = r;
pas.SecondSequence = q;
aln.Add (pas);
Assert.Throws<FormatException> (() => VariantCaller.LeftAlignIndelsAndCallVariants (aln, true));
}
}
}