From 3adad2773cd8d9361498a24e1aaf81382274675b Mon Sep 17 00:00:00 2001 From: Nigel Delaney Date: Tue, 4 Aug 2015 19:21:51 -0700 Subject: [PATCH] Add Bio.Variant Namespace This collection of code allows users to call variants with QV values from IPairwiseAlignedSequence objects that contain either regular sequences or sequences with associated quality values. Examples of how to use it are provided in the tests shown. --- src/Source/Framework/Bio.Core/Bio.Core.csproj | 7 + .../Bio.Core/Variant/AlignmentUtils.cs | 265 +++++++++++++++ .../Framework/Bio.Core/Variant/BPandQV.cs | 43 +++ .../Bio.Core/Variant/IndelVariant.cs | 120 +++++++ .../Framework/Bio.Core/Variant/SNPVariant.cs | 51 +++ .../Framework/Bio.Core/Variant/Variant.cs | 179 ++++++++++ .../Bio.Core/Variant/VariantCaller.cs | 271 ++++++++++++++++ .../Tests/Bio.Tests/Bio.Tests.Core.csproj | 2 + .../Bio.Tests/Variant/VariantCallTests.cs | 307 ++++++++++++++++++ 9 files changed, 1245 insertions(+) create mode 100644 src/Source/Framework/Bio.Core/Variant/AlignmentUtils.cs create mode 100644 src/Source/Framework/Bio.Core/Variant/BPandQV.cs create mode 100644 src/Source/Framework/Bio.Core/Variant/IndelVariant.cs create mode 100644 src/Source/Framework/Bio.Core/Variant/SNPVariant.cs create mode 100644 src/Source/Framework/Bio.Core/Variant/Variant.cs create mode 100644 src/Source/Framework/Bio.Core/Variant/VariantCaller.cs create mode 100644 src/Source/Tests/Bio.Tests/Variant/VariantCallTests.cs diff --git a/src/Source/Framework/Bio.Core/Bio.Core.csproj b/src/Source/Framework/Bio.Core/Bio.Core.csproj index a9c30af..ba647b7 100644 --- a/src/Source/Framework/Bio.Core/Bio.Core.csproj +++ b/src/Source/Framework/Bio.Core/Bio.Core.csproj @@ -433,6 +433,12 @@ + + + + + + @@ -478,5 +484,6 @@ cp "$(TargetDir)\$(TargetName).xml" "$(SolutionDir)\bin\$(TargetName).xml" + \ No newline at end of file diff --git a/src/Source/Framework/Bio.Core/Variant/AlignmentUtils.cs b/src/Source/Framework/Bio.Core/Variant/AlignmentUtils.cs new file mode 100644 index 0000000..b4ea687 --- /dev/null +++ b/src/Source/Framework/Bio.Core/Variant/AlignmentUtils.cs @@ -0,0 +1,265 @@ +using System; +using Bio.Algorithms.Alignment; +using System.Collections.Generic; +using System.Linq; +using Bio.Extensions; +using System.Diagnostics; + +namespace Bio.Variant +{ + /// + /// Utilities to manipulate and work with alignments when calling variants. + /// + public static class AlignmentUtils + { + + /// + /// The maximum number of times we can shift an indel before we fail. + /// It should never go anywhere near this number and this is only implemented as a guard. + /// + private const int MAX_LOOPS = 50; + private const string MAX_LOOPS_ERROR = "The Left-align indel step did not converge within 50 iterations"; + + /// + /// Verifies the alignment has no leading or trailing gaps and throws an exception otherwise. + /// + /// Refseq. + /// Query. + internal static void VerifyNoGapsOnEnds(byte[] refseq, BPandQV[] query) { + var gap = DnaAlphabet.Instance.Gap; + if (refseq [0] == gap || + refseq [refseq.Length - 1] == gap || + query [0].BP == gap || + query [query.Length - 1].BP == gap) { + throw new FormatException ("Alignment query and/or reference started with a gap character. " + + "Alignments must be hard-clipped to remove starting and trailing variants " + + "before variants can be called"); + } + } + + /// + /// Given two byte arrays representing a pairwise alignment, shift them so + /// that all deletions start as early as possible. For example: + /// TTTTAAAATTTT -> Converts to -> TTTTAAAATTTT + /// TTTTAA--TTTT TTTT--AATTTT + /// + /// This modifies the array in place. + /// + /// Reference Sequency + /// Query Sequence + /// + public static void LeftAlignIndels(byte[] refseq, BPandQV[] query) + { + // Validation + if (refseq.Length != query.Length) { + throw new ArgumentException("Alignment passed to LeftAlignIndels had unequal length sequences"); + } + + ValidateNoOverlappingGaps (refseq, query); + byte gap = DnaAlphabet.Instance.Gap; + // Keep left aligning until we can't anymore, this is a + // do while loop because some downstream left alignments open up + // further ones upstream, even though this is rare. + int change_count = 0; + int loopsThrough = 0; + do + { + loopsThrough++; + change_count = 0; + for (int i = 1; i < refseq.Length; i++) + { + if (refseq[i] == gap) + { + int len = GetGapLength(i, refseq); + int left_side = i - 1; + int right_side = i - 1 + len; + while (left_side >= 0 && refseq[left_side] != gap && (refseq[left_side] == query[right_side].BP)) + { + // Move the gap left. + if (right_side < refseq.Length) { + refseq[right_side] = refseq[left_side]; + } + refseq[left_side] = gap; + left_side--; + right_side--; + change_count++; + } + if (loopsThrough > MAX_LOOPS) { + throw new Exception(MAX_LOOPS_ERROR); + } + } + else if (query[i].BP == gap) + { + int len = GetGapLength(i, query); + int left_side = i - 1; + int right_side = i - 1 + len; + while (left_side >= 0 && query[left_side].BP != gap && (query[left_side].BP == refseq[right_side])) + { + // Move the gap left. + if (right_side < query.Length) { + query[right_side] = query[left_side]; + } + query[left_side] = new BPandQV(gap, 0); + left_side--; + right_side--; + change_count++; + } + if (loopsThrough > MAX_LOOPS) { + throw new Exception(MAX_LOOPS_ERROR); + } + } + } + } while (change_count > 0); + } + + /// + /// Given the start position of a gap, returns how long it is. + /// For example: + /// + /// AAAA---TTTT returns 3. + /// + /// 0 indexed + /// + /// + public static int GetGapLength(int pos, BPandQV[] array) + { + var gap = DnaAlphabet.Instance.Gap; + int len = 1; + while (++pos < array.Length) + { + if (array[pos].BP == gap) + { + len += 1; + } + else + { + break; + } + } + return len; + } + + /// + /// Given the start position of a gap, returns how long it is. + /// For example: + /// + /// AAAA---TTTT returns 3. + /// + /// 0 indexed + /// + /// + public static int GetGapLength(int pos, byte[] array) + { + var gap = DnaAlphabet.Instance.Gap; + int len = 1; + while (++pos < array.Length) + { + if (array[pos] == gap) + { + len += 1; + } + else + { + break; + } + } + return len; + } + + + /// + /// Simple check that the alignment does not have a gap on top of a + /// gap, which violates several assumptions. + /// + /// + /// + internal static void ValidateNoOverlappingGaps(byte[] seq1, BPandQV[] seq2) + { + var gap = DnaAlphabet.Instance.Gap; + for(int i=0;i + /// Reverse complements the BP and QV values accounting for homopolymers. + /// + /// This is not a simple operation because in addition to reversing the QV scores, one must account for the fact that + /// the quality value for homopolymer indel errors (that is a deletion or insertion) is typically only placed at the + /// first base in a homopolymer (though this is not standardized). To account for this, when reverse complementing, we switch the + /// QV value of the first and last base in homopolymers if the first base is lower quality than the last base. + /// + /// The reverse complemented sequence. + /// The array with the QV values to flip. + /// If set to true flip hp qv values. + internal static BPandQV[] GetReverseComplementedSequence(BPandQV[] toFlip, bool flipHpQvValues = false) + { + BPandQV[] newData = new BPandQV[toFlip.Length]; + + for (long index = 0; index < toFlip.Length; index++) + { + byte complementedSymbol; + byte symbol = toFlip[toFlip.Length - index - 1].BP; + + if (!DnaAlphabet.Instance.TryGetComplementSymbol(symbol, out complementedSymbol)) + { + throw new NotSupportedException("Bad character in BPandQV array: " + symbol.ToString()); + } + var bpandq = new BPandQV(complementedSymbol, toFlip[toFlip.Length - index -1].QV); + newData [index] = bpandq; + } + + if (flipHpQvValues) { + ReverseQVValuesForHomopolymers (newData); + } + return newData; + } + + /// + /// Reverses the QV values for homopolymers. + /// This is not a simple operation because in addition to reversing the QV scores, one must account for the fact that + /// the quality value for homopolymer indel errors (that is a deletion or insertion) is typically only placed at the + /// first base in a homopolymer (though this is not standardized). To account for this, when reverse complementing, we switch the + /// QV value of the first and last base in homopolymers if the first base is lower quality than the last base. + /// + /// The QV values for homopolymers. + /// To flip. + internal static void ReverseQVValuesForHomopolymers(BPandQV[] toFlip) { + // Basic idea is to assume it is A, C, G, T alphabet and flip HP values + // Also assumes all low QV is due to HP deletion/insertion error. + if (toFlip.Length > 1) { + byte lastbp = toFlip [0].BP; + int firstPos = 0; + int curLength = 1; + for (int i = 1; i < toFlip.Length; i++) { + byte newbp = toFlip [i].BP; + if (newbp != lastbp) { + if (curLength > 1) { + var right = toFlip [i - 1]; + var left = toFlip[firstPos]; + Debug.Assert (right.BP == left.BP); + if (right.QV < left.QV) { + toFlip [i - 1] = left; + toFlip [firstPos] = right; + } + } + firstPos = i; + lastbp = newbp; + curLength = 1; + } else if (newbp == lastbp) { + curLength++; + } + } + // Finally flip the end + if (curLength > 1) { + var tmp = toFlip [toFlip.Length - 1]; + toFlip [toFlip.Length - 1] = toFlip [firstPos]; + toFlip [firstPos] = tmp; + } + } + } + } +} \ No newline at end of file diff --git a/src/Source/Framework/Bio.Core/Variant/BPandQV.cs b/src/Source/Framework/Bio.Core/Variant/BPandQV.cs new file mode 100644 index 0000000..d580132 --- /dev/null +++ b/src/Source/Framework/Bio.Core/Variant/BPandQV.cs @@ -0,0 +1,43 @@ +using System; + +namespace Bio.Variant +{ + /// + /// A Basepair and corresponding QV value, used to keep the two values together. + /// + public struct BPandQV + { + /// + /// A, C, G, T or '-' + /// + public readonly byte BP; + + /// + /// A QV Value for the base. Should be set to 0 if no information is available + /// or is a gap character. + /// + public readonly byte QV; + + /// + /// Initializes a new instance of the struct. + /// + /// Bp. + /// QV value + /// If set to true validate that the BP is an A, C, G or T + public BPandQV (byte bp, byte qv, bool validate=true) + { + var bp2 = (byte)'-'; + if (validate && + !(bp != 'A' || + bp != 'C' || + bp != 'G' || + bp != 'T' || + bp != '-')) { + throw new ArgumentOutOfRangeException ("bp", "Basepairs must be A, C, G, T or -"); + } + BP = bp; + QV = qv; + } + } +} + diff --git a/src/Source/Framework/Bio.Core/Variant/IndelVariant.cs b/src/Source/Framework/Bio.Core/Variant/IndelVariant.cs new file mode 100644 index 0000000..0755325 --- /dev/null +++ b/src/Source/Framework/Bio.Core/Variant/IndelVariant.cs @@ -0,0 +1,120 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text; +using System.Threading.Tasks; +using Bio; + +namespace Bio.Variant +{ + /// + /// The two types of indels, insertions or deletions. + /// + [Flags] + public enum IndelType : byte { + /// + /// An insertion relative to the reference. + /// + Insertion, + /// + /// A deletion relative to the reference. + /// + Deletion} + + + /// + /// An Indel Variant. + /// + public class IndelVariant : Variant + { + /// + /// Insertion or Deletion + /// + public readonly IndelType InsertionOrDeletion; + + /// + /// These are the bases in the insertion (or what was deleted). + /// + public readonly string InsertedOrDeletedBases; + + /// + /// True if the homopolymer length of the reference is > 1. + /// + /// e.g. ACC -> TRUE + /// A-C + /// + /// + /// true if in homo polymer; otherwise, false. + public bool InHomopolymer { + get { + + return HomopolymerLengthInReference > 1; + } + } + + /// + /// At the position of the insertion or deletion, what is the length of the nearest homopolymer run in the reference. + /// For insertions, the HP length is defined by the next base in the reference. + /// + /// + /// For example, the insertion below occurs at an HP of char 'C' and length 3. + /// + /// ref: AT-CCCTG + /// query: ATCCCCTG + /// + /// For multiple insertions or a long deletion, only the HP of the first position is counted, + /// so the example below would be a 2 bp long 'C' + /// + /// + /// ref: ATCCTGCATA + /// query: AT-----ATA + /// + /// + /// + /// + public int HomopolymerLengthInReference { + get; + set; + } + + /// + /// A, C, G or T the basepair of the homopolymer. + /// + /// The homopolymer base. + public char HomopolymerBase { + get; + set; + } + + /// + /// Initializes a new instance of the class. + /// + /// Position. + /// Length. + /// Bases. + /// Insertion or deletion. + /// Hp base. + /// Hp length in reference. + /// If set to true at alignment end. + public IndelVariant(int position, int length, string bases, IndelType insertionOrDeletion, char hpBase, int hpLengthInReference, bool atAlignmentEnd = false) + : base (position, atAlignmentEnd) + { + Type = VariantType.INDEL; + InsertionOrDeletion = insertionOrDeletion; + Length = length; + InsertedOrDeletedBases = bases; + HomopolymerBase = hpBase; + HomopolymerLengthInReference = hpLengthInReference; + } + + /// + /// Returns a that represents the current . + /// + /// A that represents the current . + public override string ToString () + { + var insert = this.InsertionOrDeletion == IndelType.Deletion ? "Deletion" : "Insertion"; + return string.Format ("[IndelVariant: InHomopolymer={0}, HomopolymerLengthInReference={1}, HomopolymerBase={2}, Type={3}]", InHomopolymer, HomopolymerLengthInReference, HomopolymerBase, insert); + } + } +} \ No newline at end of file diff --git a/src/Source/Framework/Bio.Core/Variant/SNPVariant.cs b/src/Source/Framework/Bio.Core/Variant/SNPVariant.cs new file mode 100644 index 0000000..7de0ca7 --- /dev/null +++ b/src/Source/Framework/Bio.Core/Variant/SNPVariant.cs @@ -0,0 +1,51 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text; +using System.Threading.Tasks; +using System.Diagnostics; +using Bio; + + + +namespace Bio.Variant +{ + /// + /// A class to store information about a SNP variant. + /// + public class SNPVariant : Variant + { + /// + /// The BP in the reference at this position. + /// + public readonly char RefBP; + + /// + /// The variant present at this position. + /// + public readonly char AltBP; + + /// + /// Create a new SNP in the given reference at the given position. + /// + /// 0-based position on reference. + /// The variant present (A, C, G, T) + /// The Reference sequence. + public SNPVariant(int position, char altAllele, char refBP) : + base (position) + { + AltBP = altAllele; + Type = VariantType.SNP; + RefBP = (char) refBP; + Length = 1; + } + /// + /// Returns a that represents the current . + /// + /// A that represents the current . + public override string ToString () + { + return RefBP + "->" + AltBP + " @ " + StartPosition; + } + } +} diff --git a/src/Source/Framework/Bio.Core/Variant/Variant.cs b/src/Source/Framework/Bio.Core/Variant/Variant.cs new file mode 100644 index 0000000..44e0bd5 --- /dev/null +++ b/src/Source/Framework/Bio.Core/Variant/Variant.cs @@ -0,0 +1,179 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text; +using System.Threading.Tasks; +using Bio; + +namespace Bio.Variant +{ + /// + /// Top level class for holding variant information. + /// This is implemented in two sub classes, SNPVariant and IndelVariant. + /// + public abstract class Variant : IComparable, IEquatable + { + /// + /// The name of the reference sequence the variant was called from. + /// + public string RefName; + + /// + /// SNP, indel, or complex. + /// + public VariantType Type { get; protected set; } + + /// + /// 0-based start position of variant. + /// + /// For Indels, this is the left most position + /// BEFORE the event. e.g. + /// + /// AAATTTAAA -> is a deletion of length 3 starting at position 2. + /// AAA---AAA + /// + /// A--TTAAA -> is an insertion of length 2 starting at position 0. + /// ACCTTAAA + /// + /// ACCTTAAA -> is a deletion at position 3. + /// ACC-TAAA + /// + /// + /// + public int StartPosition { get; set; } + + /// + /// O-based end index (same as start for SNPs). + /// A SNP is of length 1, a deletion of length 2 is 2, etc. + /// + public int Length { get; protected set; } + + /// + /// The position in the alignment where this variant ends, exclusive. + /// For a SNP, this is the position AFTER the SNP. Same for indels. + /// + /// + /// AACCTT -> End Position = 3 + /// AAGCTT + /// + /// + /// + public int EndPosition + { + get { return StartPosition + Length; } + } + + /// + /// Is the variant at the very start or end of an alignment? + /// (that is it was called based on the first or last base seen on + /// either sequence in the alignment.) + /// These can have certain pathologies so we take note and keep an eye on them. + /// They should almost always be excluded by the alignment algorithm clipping at the ends. + /// + public bool AtEndOfAlignment { get; protected set; } + + /// + /// The QV value for this call, if it exists. If not, this is set to 0. + /// + /// The QV value. + public byte QV {get; set;} + + /// + /// Initializes a new instance of the class. + /// + /// Position. + /// If set to true at alignment end. + public Variant(int position, bool atAlignmentEnd = false) + { + StartPosition = position; + AtEndOfAlignment = atAlignmentEnd; + } + + /// + /// Serves as a hash function for a object. + /// + /// A hash code for this instance that is suitable for use in hashing algorithms and data structures such as a + /// hash table. + public override int GetHashCode () + { + //TODO: This is a little unsafe as these fields are mutable... + var h1 = RefName != null ? RefName.GetHashCode () : 0; + var h2 = StartPosition.GetHashCode (); + var h3 = EndPosition.GetHashCode (); + return h1 ^ h2 ^ h3; + } + /// + /// Determines whether the specified is equal to the current . + /// + /// Two variants are "equal" if they have the same start and end positions and are of the same type. + /// + /// The to compare with the current . + /// true if the specified is equal to the current + /// ; otherwise, false. + public override bool Equals (object obj) + { + var res = obj as Variant; + if (res != null) + return this.Equals (res); + return false; + } + #region IComparable implementation + + /// To be added. + /// Returns the sort order (Reference Name, Start Position) of the current variant compared to the specified object. + /// + /// + /// + /// The to. + /// Other. + public int CompareTo (Variant other) + { + var nameComp = String.CompareOrdinal(RefName,other.RefName); + if (nameComp == 0) { + return StartPosition.CompareTo (other.StartPosition); + } + return nameComp; + } + + #endregion + + #region IEquatable implementation + /// + /// Determines whether the specified is equal to the current . + /// + /// Two variants are equal if they have the same reference, start/end and type. + /// + /// The to compare with the current . + /// true if the specified is equal to the current + /// ; otherwise, false. + bool IEquatable.Equals (Variant other) + { + var otherComp = CompareTo (other); + if (otherComp == 0) { + return this.StartPosition == other.StartPosition && this.Type == other.Type && this.EndPosition == other.EndPosition; + } + return false; + } + + #endregion + } + + /// + /// The type of variant, either SNP, INDEL or Complex. + /// + public enum VariantType + { + /// + /// A SNP variant. + /// + SNP, + /// + /// And Indel variant. + /// + INDEL, + /// + /// A Complex variant. Not currently used but a place holder for more complex scenarios in the future. + /// + Complex + } +} diff --git a/src/Source/Framework/Bio.Core/Variant/VariantCaller.cs b/src/Source/Framework/Bio.Core/Variant/VariantCaller.cs new file mode 100644 index 0000000..14b3eac --- /dev/null +++ b/src/Source/Framework/Bio.Core/Variant/VariantCaller.cs @@ -0,0 +1,271 @@ +using System; +using System.Linq; +using System.Collections.Generic; +using System.Diagnostics; + +using Bio; +using Bio.Extensions; +using Bio.Algorithms.Alignment; + +namespace Bio.Variant +{ + /// + /// This class takes alignments + /// and generates a list of variants. + /// + public static class VariantCaller + { + + /// + /// Given two byte arrays representing a pairwise alignment, shift them so + /// that all deletions start as early as possible. For example: + /// + /// + /// TTTTAAAATTTT -> Converts to -> TTTTAAAATTTT + /// TTTTAA--TTTT TTTT--AATTTT + /// + /// + /// This function takes a IPairwiseSequenceAlignment and assumes that the first sequence is the reference and second + /// sequence is the query. It returns a new Pairwise sequence alignment with all of the indels left aligned as well as a list of variants. + /// + /// Aln. The second sequence should be of type QualitativeSequence or Sequence + /// callVariants. If true, it will call variants, otherwise the second half of tuple will be null. + public static Tuple> LeftAlignIndelsAndCallVariants(IPairwiseSequenceAlignment aln, bool callVariants = true) { + + if (aln == null) { + throw new NullReferenceException ("aln"); + } + if (aln.PairwiseAlignedSequences == null || aln.PairwiseAlignedSequences.Count != 1) { + throw new ArgumentException ("The pairwise aligned sequence should only have one alignment"); + } + var frstAln = aln.PairwiseAlignedSequences.First (); + var seq1 = frstAln.FirstSequence; + var seq2 = frstAln.SecondSequence; + if (seq1 == null) { + throw new NullReferenceException ("seq1"); + } else if (seq2 == null) { + throw new NullReferenceException ("seq2"); + } + + //TODO: Might implement an ambiguity check later. + #if FALSE + if (seq1.Alphabet.HasAmbiguity || seq2.Alphabet.HasAmbiguity) { + throw new ArgumentException ("Cannot left align sequences with ambiguous symbols."); + } + #endif + + // Note we have to copy unless we can guarantee the array will not be mutated. + byte[] refseq = seq1.ToArray (); + ISequence newQuery; + List variants = null; + // Call variants for a qualitative sequence + if (seq2 is QualitativeSequence) { + var qs = seq2 as QualitativeSequence; + var query = Enumerable.Zip (qs, qs.GetQualityScores (), (bp, qv) => new BPandQV (bp, (byte)qv, false)).ToArray (); + AlignmentUtils.LeftAlignIndels (refseq, query); + AlignmentUtils.VerifyNoGapsOnEnds (refseq, query); + if (callVariants) { + variants = VariantCaller.CallVariants (refseq, query, seq2.IsMarkedAsReverseComplement()); + } + var newQueryQS = new QualitativeSequence (qs.Alphabet, + qs.FormatType, + query.Select (z => z.BP).ToArray (), + query.Select (p => p.QV).ToArray (), + false); + newQueryQS.Metadata = seq2.Metadata; + newQuery = newQueryQS; + + } else if (seq2 is Sequence) { // For a sequence with no QV values. + var qs = seq2 as Sequence; + var query = qs.Select (v => new BPandQV (v, 0, false)).ToArray(); + AlignmentUtils.LeftAlignIndels (refseq, query); + AlignmentUtils.VerifyNoGapsOnEnds (refseq, query); + // ISequence does not have a setable metadata + var newQueryS = new Sequence(qs.Alphabet, query.Select(z=>z.BP).ToArray(), false); + newQueryS.Metadata = seq2.Metadata; + if (callVariants) { + variants = VariantCaller.CallVariants (refseq, query, seq2.IsMarkedAsReverseComplement()); + } + newQuery = newQueryS; + } else { + throw new ArgumentException ("Can only left align indels if the query sequence is of type Sequence or QualitativeSequence."); + } + + if (aln.FirstSequence != null && aln.FirstSequence.ID != null) { + foreach (var v in variants) { + v.RefName = aln.FirstSequence.ID; + } + } + + var newRef = new Sequence (seq1.Alphabet, refseq, false); + newRef.ID = seq1.ID; + newRef.Metadata = seq1.Metadata; + + newQuery.ID = seq2.ID; + + var newaln = new PairwiseSequenceAlignment (aln.FirstSequence, aln.SecondSequence); + var pas = new PairwiseAlignedSequence (); + pas.FirstSequence = newRef; + pas.SecondSequence = newQuery; + newaln.Add (pas); + return new Tuple> (newaln, variants); + } + + /// + /// Given a pairwise sequence alignment, call variants, producing + /// a list of SNPs and Indels found in the alignment. + /// + /// This method will first left-align all variants before calling to be consistent with other + /// software. The + /// + /// The Pairwise alignment to call variants with. + /// + /// + + public static List CallVariants(IPairwiseSequenceAlignment aln) { + return LeftAlignIndelsAndCallVariants (aln).Item2; + } + + /// + /// Calls the variants. + /// + /// Should only be used internally as assumptions are made that the alignments are left-aligned and fulfill certain criteria. + /// + /// The variants. + /// Reference seq. + /// Query seq. + /// If set to true the query sequence was originally reverse complemented. (this affects QV value scoring) + internal static List CallVariants(byte[] refSeq, BPandQV[] querySeq, bool originallyReverseComplemented) + { + if (originallyReverseComplemented) { + AlignmentUtils.ReverseQVValuesForHomopolymers (querySeq); + } + List variants = new List(); + + // Now call variants. + var gap = DnaAlphabet.Instance.Gap; + int i = 0; + int refPos = 0; + int queryPos = 0; + while( i < refSeq.Length) + { + if (refSeq[i] == gap) + { + int len = AlignmentUtils.GetGapLength(i, refSeq); + var nextBasePos = (i + len); + // Should alway be true as we don't end in gaps + Debug.Assert (nextBasePos < refSeq.Length); + var hplenAndChar = determineHomoPolymerLength (nextBasePos, refSeq); + var bases = getBases(querySeq, i, len); + var newVariant = new IndelVariant(refPos - 1, len, bases, IndelType.Insertion, + hplenAndChar.Item2, hplenAndChar.Item1, + (i == 0 || (i + len + hplenAndChar.Item1) >= refSeq.Length)); + newVariant.QV = querySeq[queryPos].QV; + variants.Add(newVariant); + i += len; + queryPos += len; + } + else if (querySeq[i].BP == gap) + { + int len = AlignmentUtils.GetGapLength(i, querySeq); + var bases = getBases(refSeq, i, len); + var hplenAndChar = determineHomoPolymerLength (i, refSeq); + var newVariant = new IndelVariant(refPos - 1, len, bases, + IndelType.Deletion, hplenAndChar.Item2, + hplenAndChar.Item1, (i == 0 || (i + len + hplenAndChar.Item1) >= refSeq.Length)); + /* An insertion mutation occurs BEFORE pos, so normally we get the next base + * or the last one if it's a reverse complemented alignment. However, this is not true if + * it is a homopolymer because what would have been the previous position is the next position + * after left aligning and reversing the position of the QV value. + * + * Consider the following + * --*- -*-- + * A-TA --> TA-T + * AGTA TACT + * + * However, + * --*-- --*-- + * A-TTA ----> T-AAT + * ATTTA TAAAT + * + */ + if ((i + len ) < querySeq.Length) { + + var qc_pos = originallyReverseComplemented ? i - 1 : i + len; + if (newVariant.InHomopolymer) { + qc_pos = i + len; + } + newVariant.QV = querySeq[qc_pos].QV; + } + variants.Add(newVariant); + i += len; + refPos += len; + } + else + { + if (querySeq[i].BP != refSeq[i]) + { + var newVariant = new SNPVariant(refPos, (char) querySeq[i].BP, (char)refSeq[i]); + newVariant.QV = querySeq [queryPos].QV; + variants.Add(newVariant); + } + i++; refPos++; queryPos++; + } + } + return variants; + } + + /// + /// Converts a subset of bases in the array into a string. + /// + /// + /// + /// + /// + private static string getBases(byte[] array, int position, int length) + { + char[] chars = new char[length]; + for(int i=0; i + /// Gets the bases for a length of the sequence as a string. + /// + /// The bases. + /// Array. + /// Position. + /// Length. + private static string getBases(BPandQV[] array, int position, int length) + { + char[] chars = new char[length]; + for(int i=0; i + /// Returns the length of the homopolymer + /// + /// The homopolymer length. + private static Tuple determineHomoPolymerLength(int pos, byte[] refSeq) + { + + byte start_bp = refSeq[pos]; + int length = 1; + while ( ++pos < refSeq.Length && + refSeq [pos] == start_bp) { + length++; + } + var homopolymerBase = (char)start_bp; + return new Tuple (length, homopolymerBase); + } + } +} + diff --git a/src/Source/Tests/Bio.Tests/Bio.Tests.Core.csproj b/src/Source/Tests/Bio.Tests/Bio.Tests.Core.csproj index b707a81..2203899 100644 --- a/src/Source/Tests/Bio.Tests/Bio.Tests.Core.csproj +++ b/src/Source/Tests/Bio.Tests/Bio.Tests.Core.csproj @@ -218,6 +218,7 @@ + @@ -269,5 +270,6 @@ + \ No newline at end of file diff --git a/src/Source/Tests/Bio.Tests/Variant/VariantCallTests.cs b/src/Source/Tests/Bio.Tests/Variant/VariantCallTests.cs new file mode 100644 index 0000000..7c1c14c --- /dev/null +++ b/src/Source/Tests/Bio.Tests/Variant/VariantCallTests.cs @@ -0,0 +1,307 @@ +using System; +using System.Collections.Generic; +using System.Linq; + +using Bio; +using Bio.Variant; +using Bio.Algorithms.Alignment; +using Bio.Extensions; +using NUnit.Framework; + +namespace Bio.Tests +{ + [TestFixture] + [Category("VariantCalls")] + public static class VariantCallTests + { + /// + /// Converts the Sequence to a QualitativeSequence in the alignment. + /// + /// Aln. + /// Qual scores. + public static void ConvertAlignedSequenceToQualSeq(IPairwiseSequenceAlignment aln, int[] qualScores) { + var q = aln.PairwiseAlignedSequences [0].SecondSequence as Sequence; + var qvs = new int[q.Count]; + int queryPos = 0; + for (int i = 0; i < qvs.Length; i++) { + if (q [i] == '-') { + qvs [i] = 0; + } else { + qvs [i] = qualScores[queryPos++]; + } + } + var qseq = new QualitativeSequence (DnaAlphabet.Instance, FastQFormatType.Sanger, q.ToArray (), qvs, false); + + aln.PairwiseAlignedSequences [0].SecondSequence = qseq; + + } + + [Test] + + public static void Test1BPInsertionCall() + { + string seq1seq = "ATA-CCCTT".Replace("-", String.Empty); + string seq2seq = "ATACCCCTT"; + int[] seq2qual = new int[] { 30, 30, 30, 4, 30, 30, 30, 30, 30 }; + var refseq = new Sequence(AmbiguousDnaAlphabet.Instance, seq1seq, false); + var query = new Sequence (AmbiguousDnaAlphabet.Instance, seq2seq, false); + NeedlemanWunschAligner aligner = new NeedlemanWunschAligner (); + var aln = aligner.Align (refseq, query).First(); + // Need to add in the QV Values. + ConvertAlignedSequenceToQualSeq(aln, seq2qual); + var variants = VariantCaller.CallVariants (aln); + Assert.AreEqual (variants.Count, 1); + var variant = variants.First (); + Assert.AreEqual (4, variant.QV); + Assert.AreEqual (2, variant.StartPosition); + Assert.AreEqual (VariantType.INDEL, variant.Type); + var vi = variant as IndelVariant; + Assert.AreEqual ("C", vi.InsertedOrDeletedBases); + Assert.AreEqual ('C', vi.HomopolymerBase); + Assert.AreEqual (3, vi.HomopolymerLengthInReference); + Assert.AreEqual (true, vi.InHomopolymer); + Assert.AreEqual (vi.InsertionOrDeletion, IndelType.Insertion); + } + + [Test] + public static void Test1BPDeletionCall() + { + string seq1seq = "ATACCCCTT"; + string seq2seq = "ATA-CCCTT".Replace("-", String.Empty); + int[] seq2qual = new int[] { 30, 30, 30, 2, 30, 30, 30, 30 }; + var refseq = new Sequence(AmbiguousDnaAlphabet.Instance, seq1seq, false); + var query = new Sequence (AmbiguousDnaAlphabet.Instance, seq2seq, false); + NeedlemanWunschAligner aligner = new NeedlemanWunschAligner (); + var aln = aligner.Align (refseq, query).First(); + // Need to add in the QV Values. + ConvertAlignedSequenceToQualSeq(aln, seq2qual); + var variants = VariantCaller.CallVariants (aln); + Assert.AreEqual (variants.Count, 1); + var variant = variants.First (); + Assert.AreEqual (2, variant.QV); + Assert.AreEqual (2, variant.StartPosition); + Assert.AreEqual (VariantType.INDEL, variant.Type); + var vi = variant as IndelVariant; + Assert.AreEqual ("C", vi.InsertedOrDeletedBases); + Assert.AreEqual ('C', vi.HomopolymerBase); + Assert.AreEqual (4, vi.HomopolymerLengthInReference); + Assert.AreEqual (true, vi.InHomopolymer); + Assert.AreEqual (vi.InsertionOrDeletion, IndelType.Deletion); + } + + [Test] + public static void TestSNPCall() + { + string seq1seq = "ATCCCCCTT"; + string seq2seq = "ATCCCTCTT"; + int[] seq2qual = new int[] { 30, 30, 30, 30, 5, 3, 30, 30, 30 }; + var refseq = new Sequence(DnaAlphabet.Instance, seq1seq); + var query = new Sequence (DnaAlphabet.Instance, seq2seq); + + NeedlemanWunschAligner aligner = new NeedlemanWunschAligner (); + var aln = aligner.Align (refseq, query).First(); + ConvertAlignedSequenceToQualSeq (aln, seq2qual); + var variants = VariantCaller.CallVariants (aln); + Assert.AreEqual (variants.Count, 1); + var variant = variants.First (); + Assert.AreEqual (3, variant.QV); + Assert.AreEqual (5, variant.StartPosition); + Assert.AreEqual (variant.Type, VariantType.SNP); + var vi = variant as SNPVariant; + Assert.AreEqual (1, vi.Length); + Assert.AreEqual ('T', vi.AltBP); + Assert.AreEqual ('C', vi.RefBP); + Assert.AreEqual (VariantType.SNP, vi.Type); + Assert.AreEqual (false, vi.AtEndOfAlignment); + } + + [Test] + public static void TestLeftAlignmentStep() { + var refseq = "ACAATAAAAGCGCGCGCGCGTTACGTATAT--ATGGATAT"; + var queryseq = "ACAATAA-AGC--GCGC--GTTACGTATATATATGGATAT"; + + var r = new Sequence (DnaAlphabet.Instance, refseq); + var q = new Sequence (DnaAlphabet.Instance, queryseq); + var aln = new PairwiseSequenceAlignment (r, q); + var pas = new PairwiseAlignedSequence (); + pas.FirstSequence = r; + pas.SecondSequence = q; + aln.Add (pas); + var tpl = VariantCaller.LeftAlignIndelsAndCallVariants (aln, true); + + // Check the left alignment + aln = tpl.Item1 as PairwiseSequenceAlignment; + var lar = aln.PairwiseAlignedSequences [0].FirstSequence.ConvertToString(); + var laq = aln.PairwiseAlignedSequences [0].SecondSequence.ConvertToString(); + var exprefseq = "ACAATAAAAGCGCGCGCGCGTTACG--TATATATGGATAT"; + var expqueryseq = "ACAAT-AAA----GCGCGCGTTACGTATATATATGGATAT"; + Assert.AreEqual (exprefseq, lar); + Assert.AreEqual (expqueryseq, laq); + + // And it's hard, so we might as well check the variants + var variants = tpl.Item2; + Assert.AreEqual (3, variants.Count); + string[] bases = new string[] { "A", "GCGC", "TA" }; + char[] hpbases = new char[] { 'A', 'G', 'T' }; + bool[] inHp = new bool[] { true, false, false }; + int[] lengths = new int[] { 1, 4, 2 }; + int[] starts = new int[] { 4, 8, 24 }; + IndelType[] types = new IndelType[] { IndelType.Deletion, IndelType.Deletion, IndelType.Insertion }; + for (int i = 0; i < 3; i++) { + Assert.AreEqual (VariantType.INDEL, variants [i].Type); + var vi = variants [i] as IndelVariant; + Assert.AreEqual (hpbases[i], vi.HomopolymerBase); + Assert.AreEqual (starts [i], vi.StartPosition); + Assert.AreEqual (lengths [i], vi.Length); + Assert.AreEqual (bases [i], vi.InsertedOrDeletedBases); + Assert.AreEqual (inHp [i], vi.InHomopolymer); + Assert.AreEqual (types [i], vi.InsertionOrDeletion); + + } + + } + + [Test] + public static void TestReverseComplement1BPIndelCall() { + + string seq1seq = "ATACCCCTTGCGC"; + string seq2seq = "ATA-CCCTTGCGC".Replace("-", String.Empty); + int[] seq2qual = new int[] { 30, 30, 30, 2, 30, 30, 30, 30, 30, 30, 30, 30 }; + var refseq = new Sequence(DnaAlphabet.Instance, seq1seq); + var query = new Sequence (DnaAlphabet.Instance, seq2seq); + + var s1rc = refseq.GetReverseComplementedSequence (); + var s2rc = query.GetReverseComplementedSequence (); + + NeedlemanWunschAligner aligner = new NeedlemanWunschAligner (); + var aln = aligner.Align (s1rc, s2rc).First(); + VariantCallTests.ConvertAlignedSequenceToQualSeq (aln, seq2qual.Reverse ().ToArray ()); + aln.PairwiseAlignedSequences [0].Sequences [1].MarkAsReverseComplement (); + var variants = VariantCaller.CallVariants (aln); + Assert.AreEqual (variants.Count, 1); + var variant = variants.First (); + Assert.AreEqual (2, variant.QV); + Assert.AreEqual (5, variant.StartPosition); + Assert.AreEqual (VariantType.INDEL, variant.Type); + var vi = variant as IndelVariant; + Assert.AreEqual (IndelType.Deletion, vi.InsertionOrDeletion); + Assert.AreEqual ('G', vi.HomopolymerBase); + Assert.AreEqual (1, vi.Length); + Assert.AreEqual (4, vi.HomopolymerLengthInReference); + Assert.AreEqual (true, vi.InHomopolymer); + Assert.AreEqual ("G", vi.InsertedOrDeletedBases); + Assert.AreEqual (false, vi.AtEndOfAlignment); + Assert.AreEqual (6, vi.EndPosition); + } + + + [Test] + public static void TestTrickyQVInversions() { + // This will be hard because normally flip the QV value for a homopolymer, but in this case we won't. + // Note the whole notion of flipping is poorly defined. + string seq1seq = "ATTGC"; + string seq2seq = "ATAGC"; + int[] seq2qual = new int[] { 30, 30, 2, 30, 30 }; + var refseq = new Sequence(DnaAlphabet.Instance, seq1seq); + var query = new Sequence (DnaAlphabet.Instance, seq2seq); + + var s1rc = refseq.GetReverseComplementedSequence (); + var s2rc = query.GetReverseComplementedSequence (); + + NeedlemanWunschAligner aligner = new NeedlemanWunschAligner (); + var aln = aligner.Align (s1rc, s2rc).First(); + VariantCallTests.ConvertAlignedSequenceToQualSeq (aln, seq2qual.Reverse ().ToArray ()); + aln.PairwiseAlignedSequences [0].Sequences [1].MarkAsReverseComplement (); + var variants = VariantCaller.CallVariants (aln); + Assert.AreEqual (1, variants.Count); + var variant = variants.First (); + Assert.AreEqual (VariantType.SNP, variant.Type); + Assert.AreEqual (2, variant.QV); + + var vs = variant as SNPVariant; + Assert.AreEqual ('T', vs.AltBP); + Assert.AreEqual ('A', vs.RefBP); + } + + [Test] + public static void TestInsertionAtEndofHP() { + string seq1seq = "ATA-CCC".Replace("-", String.Empty); + string seq2seq = "ATACCCC"; + int[] seq2qual = new int[] { 30, 30, 30, 4, 30, 30, 30 }; + var refseq = new Sequence(AmbiguousDnaAlphabet.Instance, seq1seq, false); + var query = new Sequence (AmbiguousDnaAlphabet.Instance, seq2seq, false); + NeedlemanWunschAligner aligner = new NeedlemanWunschAligner (); + var aln = aligner.Align (refseq, query).First(); + // Need to add in the QV Values. + ConvertAlignedSequenceToQualSeq(aln, seq2qual); + var variants = VariantCaller.CallVariants (aln); + Assert.AreEqual (variants.Count, 1); + var variant = variants.First (); + Assert.AreEqual (4, variant.QV); + Assert.AreEqual (2, variant.StartPosition); + Assert.AreEqual (VariantType.INDEL, variant.Type); + var vi = variant as IndelVariant; + Assert.AreEqual ("C", vi.InsertedOrDeletedBases); + Assert.AreEqual ('C', vi.HomopolymerBase); + Assert.AreEqual (true, vi.AtEndOfAlignment); + Assert.AreEqual (3, vi.HomopolymerLengthInReference); + Assert.AreEqual (true, vi.InHomopolymer); + Assert.AreEqual (vi.InsertionOrDeletion, IndelType.Insertion); + } + + + [Test] + public static void TestDeletionAtEndofHP() { + string seq1seq = "ATACCCC"; + string seq2seq = "ATA-CCC".Replace("-", String.Empty); + int[] seq2qual = new int[] { 30, 30, 30, 4, 30, 30 }; + var refseq = new Sequence(AmbiguousDnaAlphabet.Instance, seq1seq, false); + var query = new Sequence (AmbiguousDnaAlphabet.Instance, seq2seq, false); + NeedlemanWunschAligner aligner = new NeedlemanWunschAligner (); + var aln = aligner.Align (refseq, query).First(); + // Need to add in the QV Values. + ConvertAlignedSequenceToQualSeq(aln, seq2qual); + var variants = VariantCaller.CallVariants (aln); + Assert.AreEqual (variants.Count, 1); + var variant = variants.First (); + Assert.AreEqual (4, variant.QV); + Assert.AreEqual (2, variant.StartPosition); + Assert.AreEqual (VariantType.INDEL, variant.Type); + var vi = variant as IndelVariant; + Assert.AreEqual ("C", vi.InsertedOrDeletedBases); + Assert.AreEqual ('C', vi.HomopolymerBase); + Assert.AreEqual (true, vi.AtEndOfAlignment); + Assert.AreEqual (4, vi.HomopolymerLengthInReference); + Assert.AreEqual (true, vi.InHomopolymer); + Assert.AreEqual (vi.InsertionOrDeletion, IndelType.Deletion); + } + + [Test] + public static void TestExceptionThrownForUnclippedAlignment() { + var refseq = "ACAATATA"; + var queryseq = "ACAATAT-"; + + var r = new Sequence (DnaAlphabet.Instance, refseq); + var q = new Sequence (DnaAlphabet.Instance, queryseq); + var aln = new PairwiseSequenceAlignment (r, q); + var pas = new PairwiseAlignedSequence (); + pas.FirstSequence = r; + pas.SecondSequence = q; + aln.Add (pas); + Assert.Throws (() => VariantCaller.LeftAlignIndelsAndCallVariants (aln, true)); + + refseq = "AAACAATATA"; + queryseq = "AA-CAATATA"; + + r = new Sequence (DnaAlphabet.Instance, refseq); + q = new Sequence (DnaAlphabet.Instance, queryseq); + aln = new PairwiseSequenceAlignment (r, q); + pas = new PairwiseAlignedSequence (); + pas.FirstSequence = r; + pas.SecondSequence = q; + aln.Add (pas); + Assert.Throws (() => VariantCaller.LeftAlignIndelsAndCallVariants (aln, true)); + } + } +} +