This collection of code allows users to call variants with QV values
from IPairwiseAlignedSequence objects that contain either regular
sequences or sequences with associated quality values.  Examples of how
to use it are provided in the tests shown.
This commit is contained in:
Nigel Delaney 2015-08-04 19:21:51 -07:00
Родитель 885fef9064
Коммит 3adad2773c
9 изменённых файлов: 1245 добавлений и 0 удалений

Просмотреть файл

@ -433,6 +433,12 @@
<Compile Include="WordMatch.cs" />
<Compile Include="IO\PacBio\PacBioCCSRead.cs" />
<Compile Include="IO\PacBio\PacBioCCSBamReader.cs" />
<Compile Include="Variant\AlignmentUtils.cs" />
<Compile Include="Variant\BPandQV.cs" />
<Compile Include="Variant\IndelVariant.cs" />
<Compile Include="Variant\SNPVariant.cs" />
<Compile Include="Variant\Variant.cs" />
<Compile Include="Variant\VariantCaller.cs" />
</ItemGroup>
<ItemGroup>
<EmbeddedResource Include="CloneLibrary\Resources\Library.txt" />
@ -478,5 +484,6 @@ cp "$(TargetDir)\$(TargetName).xml" "$(SolutionDir)\bin\$(TargetName).xml"</Post
-->
<ItemGroup>
<Folder Include="IO\PacBio\" />
<Folder Include="Variant\" />
</ItemGroup>
</Project>

Просмотреть файл

@ -0,0 +1,265 @@
using System;
using Bio.Algorithms.Alignment;
using System.Collections.Generic;
using System.Linq;
using Bio.Extensions;
using System.Diagnostics;
namespace Bio.Variant
{
/// <summary>
/// Utilities to manipulate and work with alignments when calling variants.
/// </summary>
public static class AlignmentUtils
{
/// <summary>
/// The maximum number of times we can shift an indel before we fail.
/// It should never go anywhere near this number and this is only implemented as a guard.
/// </summary>
private const int MAX_LOOPS = 50;
private const string MAX_LOOPS_ERROR = "The Left-align indel step did not converge within 50 iterations";
/// <summary>
/// Verifies the alignment has no leading or trailing gaps and throws an exception otherwise.
/// </summary>
/// <param name="refseq">Refseq.</param>
/// <param name="query">Query.</param>
internal static void VerifyNoGapsOnEnds(byte[] refseq, BPandQV[] query) {
var gap = DnaAlphabet.Instance.Gap;
if (refseq [0] == gap ||
refseq [refseq.Length - 1] == gap ||
query [0].BP == gap ||
query [query.Length - 1].BP == gap) {
throw new FormatException ("Alignment query and/or reference started with a gap character. " +
"Alignments must be hard-clipped to remove starting and trailing variants " +
"before variants can be called");
}
}
/// <summary>
/// Given two byte arrays representing a pairwise alignment, shift them so
/// that all deletions start as early as possible. For example:
/// TTTTAAAATTTT -> Converts to -> TTTTAAAATTTT
/// TTTTAA--TTTT TTTT--AATTTT
///
/// This modifies the array in place.
/// </summary>
/// <param name="refseq">Reference Sequency</param>
/// <param name="query">Query Sequence</param>
/// <returns></returns>
public static void LeftAlignIndels(byte[] refseq, BPandQV[] query)
{
// Validation
if (refseq.Length != query.Length) {
throw new ArgumentException("Alignment passed to LeftAlignIndels had unequal length sequences");
}
ValidateNoOverlappingGaps (refseq, query);
byte gap = DnaAlphabet.Instance.Gap;
// Keep left aligning until we can't anymore, this is a
// do while loop because some downstream left alignments open up
// further ones upstream, even though this is rare.
int change_count = 0;
int loopsThrough = 0;
do
{
loopsThrough++;
change_count = 0;
for (int i = 1; i < refseq.Length; i++)
{
if (refseq[i] == gap)
{
int len = GetGapLength(i, refseq);
int left_side = i - 1;
int right_side = i - 1 + len;
while (left_side >= 0 && refseq[left_side] != gap && (refseq[left_side] == query[right_side].BP))
{
// Move the gap left.
if (right_side < refseq.Length) {
refseq[right_side] = refseq[left_side];
}
refseq[left_side] = gap;
left_side--;
right_side--;
change_count++;
}
if (loopsThrough > MAX_LOOPS) {
throw new Exception(MAX_LOOPS_ERROR);
}
}
else if (query[i].BP == gap)
{
int len = GetGapLength(i, query);
int left_side = i - 1;
int right_side = i - 1 + len;
while (left_side >= 0 && query[left_side].BP != gap && (query[left_side].BP == refseq[right_side]))
{
// Move the gap left.
if (right_side < query.Length) {
query[right_side] = query[left_side];
}
query[left_side] = new BPandQV(gap, 0);
left_side--;
right_side--;
change_count++;
}
if (loopsThrough > MAX_LOOPS) {
throw new Exception(MAX_LOOPS_ERROR);
}
}
}
} while (change_count > 0);
}
/// <summary>
/// Given the start position of a gap, returns how long it is.
/// For example:
///
/// AAAA---TTTT returns 3.
/// </summary>
/// <param name="pos">0 indexed</param>
/// <param name="array"></param>
/// <returns></returns>
public static int GetGapLength(int pos, BPandQV[] array)
{
var gap = DnaAlphabet.Instance.Gap;
int len = 1;
while (++pos < array.Length)
{
if (array[pos].BP == gap)
{
len += 1;
}
else
{
break;
}
}
return len;
}
/// <summary>
/// Given the start position of a gap, returns how long it is.
/// For example:
///
/// AAAA---TTTT returns 3.
/// </summary>
/// <param name="pos">0 indexed</param>
/// <param name="array"></param>
/// <returns></returns>
public static int GetGapLength(int pos, byte[] array)
{
var gap = DnaAlphabet.Instance.Gap;
int len = 1;
while (++pos < array.Length)
{
if (array[pos] == gap)
{
len += 1;
}
else
{
break;
}
}
return len;
}
/// <summary>
/// Simple check that the alignment does not have a gap on top of a
/// gap, which violates several assumptions.
/// </summary>
/// <param name="seq1"></param>
/// <param name="seq2"></param>
internal static void ValidateNoOverlappingGaps(byte[] seq1, BPandQV[] seq2)
{
var gap = DnaAlphabet.Instance.Gap;
for(int i=0;i<seq1.Length;i++)
{
if (seq1[i] == gap && seq2[i].BP == gap)
throw new Exception("You have an alignment with overlapping gaps. Input problem!");
}
}
/// <summary>
/// Reverse complements the BP and QV values accounting for homopolymers.
///
/// This is not a simple operation because in addition to reversing the QV scores, one must account for the fact that
/// the quality value for homopolymer indel errors (that is a deletion or insertion) is typically only placed at the
/// first base in a homopolymer (though this is not standardized). To account for this, when reverse complementing, we switch the
/// QV value of the first and last base in homopolymers if the first base is lower quality than the last base.
/// </summary>
/// <returns>The reverse complemented sequence.</returns>
/// <param name="toFlip">The array with the QV values to flip.</param>
/// <param name="flipHpQvValues">If set to <c>true</c> flip hp qv values.</param>
internal static BPandQV[] GetReverseComplementedSequence(BPandQV[] toFlip, bool flipHpQvValues = false)
{
BPandQV[] newData = new BPandQV[toFlip.Length];
for (long index = 0; index < toFlip.Length; index++)
{
byte complementedSymbol;
byte symbol = toFlip[toFlip.Length - index - 1].BP;
if (!DnaAlphabet.Instance.TryGetComplementSymbol(symbol, out complementedSymbol))
{
throw new NotSupportedException("Bad character in BPandQV array: " + symbol.ToString());
}
var bpandq = new BPandQV(complementedSymbol, toFlip[toFlip.Length - index -1].QV);
newData [index] = bpandq;
}
if (flipHpQvValues) {
ReverseQVValuesForHomopolymers (newData);
}
return newData;
}
/// <summary>
/// Reverses the QV values for homopolymers.
/// This is not a simple operation because in addition to reversing the QV scores, one must account for the fact that
/// the quality value for homopolymer indel errors (that is a deletion or insertion) is typically only placed at the
/// first base in a homopolymer (though this is not standardized). To account for this, when reverse complementing, we switch the
/// QV value of the first and last base in homopolymers if the first base is lower quality than the last base.
/// </summary>
/// <returns>The QV values for homopolymers.</returns>
/// <param name="toFlip">To flip.</param>
internal static void ReverseQVValuesForHomopolymers(BPandQV[] toFlip) {
// Basic idea is to assume it is A, C, G, T alphabet and flip HP values
// Also assumes all low QV is due to HP deletion/insertion error.
if (toFlip.Length > 1) {
byte lastbp = toFlip [0].BP;
int firstPos = 0;
int curLength = 1;
for (int i = 1; i < toFlip.Length; i++) {
byte newbp = toFlip [i].BP;
if (newbp != lastbp) {
if (curLength > 1) {
var right = toFlip [i - 1];
var left = toFlip[firstPos];
Debug.Assert (right.BP == left.BP);
if (right.QV < left.QV) {
toFlip [i - 1] = left;
toFlip [firstPos] = right;
}
}
firstPos = i;
lastbp = newbp;
curLength = 1;
} else if (newbp == lastbp) {
curLength++;
}
}
// Finally flip the end
if (curLength > 1) {
var tmp = toFlip [toFlip.Length - 1];
toFlip [toFlip.Length - 1] = toFlip [firstPos];
toFlip [firstPos] = tmp;
}
}
}
}
}

Просмотреть файл

@ -0,0 +1,43 @@
using System;
namespace Bio.Variant
{
/// <summary>
/// A Basepair and corresponding QV value, used to keep the two values together.
/// </summary>
public struct BPandQV
{
/// <summary>
/// A, C, G, T or '-'
/// </summary>
public readonly byte BP;
/// <summary>
/// A QV Value for the base. Should be set to 0 if no information is available
/// or is a gap character.
/// </summary>
public readonly byte QV;
/// <summary>
/// Initializes a new instance of the <see cref="Bio.Variant.BPandQV"/> struct.
/// </summary>
/// <param name="bp">Bp.</param>
/// <param name="qv">QV value</param>
/// <param name="validate">If set to <c>true</c> validate that the BP is an A, C, G or T</param>
public BPandQV (byte bp, byte qv, bool validate=true)
{
var bp2 = (byte)'-';
if (validate &&
!(bp != 'A' ||
bp != 'C' ||
bp != 'G' ||
bp != 'T' ||
bp != '-')) {
throw new ArgumentOutOfRangeException ("bp", "Basepairs must be A, C, G, T or -");
}
BP = bp;
QV = qv;
}
}
}

Просмотреть файл

@ -0,0 +1,120 @@
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using Bio;
namespace Bio.Variant
{
/// <summary>
/// The two types of indels, insertions or deletions.
/// </summary>
[Flags]
public enum IndelType : byte {
/// <summary>
/// An insertion relative to the reference.
/// </summary>
Insertion,
/// <summary>
/// A deletion relative to the reference.
/// </summary>
Deletion}
/// <summary>
/// An Indel Variant.
/// </summary>
public class IndelVariant : Variant
{
/// <summary>
/// Insertion or Deletion
/// </summary>
public readonly IndelType InsertionOrDeletion;
/// <summary>
/// These are the bases in the insertion (or what was deleted).
/// </summary>
public readonly string InsertedOrDeletedBases;
/// <summary>
/// True if the homopolymer length of the reference is > 1.
/// <code>
/// e.g. ACC -> TRUE
/// A-C
/// </code>
/// </summary>
/// <value><c>true</c> if in homo polymer; otherwise, <c>false</c>.</value>
public bool InHomopolymer {
get {
return HomopolymerLengthInReference > 1;
}
}
///
/// At the position of the insertion or deletion, what is the length of the nearest homopolymer run in the reference.
/// For insertions, the HP length is defined by the next base in the reference.
///
/// <code>
/// For example, the insertion below occurs at an HP of char 'C' and length 3.
///
/// ref: AT-CCCTG
/// query: ATCCCCTG
///
/// For multiple insertions or a long deletion, only the HP of the first position is counted,
/// so the example below would be a 2 bp long 'C'
///
///
/// ref: ATCCTGCATA
/// query: AT-----ATA
///
///
/// </code>
///
public int HomopolymerLengthInReference {
get;
set;
}
/// <summary>
/// A, C, G or T the basepair of the homopolymer.
/// </summary>
/// <value>The homopolymer base.</value>
public char HomopolymerBase {
get;
set;
}
/// <summary>
/// Initializes a new instance of the <see cref="Bio.Variant.IndelVariant"/> class.
/// </summary>
/// <param name="position">Position.</param>
/// <param name="length">Length.</param>
/// <param name="bases">Bases.</param>
/// <param name="insertionOrDeletion">Insertion or deletion.</param>
/// <param name="hpBase">Hp base.</param>
/// <param name="hpLengthInReference">Hp length in reference.</param>
/// <param name="atAlignmentEnd">If set to <c>true</c> at alignment end.</param>
public IndelVariant(int position, int length, string bases, IndelType insertionOrDeletion, char hpBase, int hpLengthInReference, bool atAlignmentEnd = false)
: base (position, atAlignmentEnd)
{
Type = VariantType.INDEL;
InsertionOrDeletion = insertionOrDeletion;
Length = length;
InsertedOrDeletedBases = bases;
HomopolymerBase = hpBase;
HomopolymerLengthInReference = hpLengthInReference;
}
/// <summary>
/// Returns a <see cref="System.String"/> that represents the current <see cref="Bio.Variant.IndelVariant"/>.
/// </summary>
/// <returns>A <see cref="System.String"/> that represents the current <see cref="Bio.Variant.IndelVariant"/>.</returns>
public override string ToString ()
{
var insert = this.InsertionOrDeletion == IndelType.Deletion ? "Deletion" : "Insertion";
return string.Format ("[IndelVariant: InHomopolymer={0}, HomopolymerLengthInReference={1}, HomopolymerBase={2}, Type={3}]", InHomopolymer, HomopolymerLengthInReference, HomopolymerBase, insert);
}
}
}

Просмотреть файл

@ -0,0 +1,51 @@
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using System.Diagnostics;
using Bio;
namespace Bio.Variant
{
/// <summary>
/// A class to store information about a SNP variant.
/// </summary>
public class SNPVariant : Variant
{
/// <summary>
/// The BP in the reference at this position.
/// </summary>
public readonly char RefBP;
/// <summary>
/// The variant present at this position.
/// </summary>
public readonly char AltBP;
/// <summary>
/// Create a new SNP in the given reference at the given position.
/// </summary>
/// <param name="position">0-based position on reference.</param>
/// <param name="altAllele">The variant present (A, C, G, T)</param>
/// <param name="refBP">The Reference sequence.</param>
public SNPVariant(int position, char altAllele, char refBP) :
base (position)
{
AltBP = altAllele;
Type = VariantType.SNP;
RefBP = (char) refBP;
Length = 1;
}
/// <summary>
/// Returns a <see cref="System.String"/> that represents the current <see cref="Bio.Variant.SNPVariant"/>.
/// </summary>
/// <returns>A <see cref="System.String"/> that represents the current <see cref="Bio.Variant.SNPVariant"/>.</returns>
public override string ToString ()
{
return RefBP + "->" + AltBP + " @ " + StartPosition;
}
}
}

Просмотреть файл

@ -0,0 +1,179 @@
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using Bio;
namespace Bio.Variant
{
/// <summary>
/// Top level class for holding variant information.
/// This is implemented in two sub classes, SNPVariant and IndelVariant.
/// </summary>
public abstract class Variant : IComparable<Variant>, IEquatable<Variant>
{
/// <summary>
/// The name of the reference sequence the variant was called from.
/// </summary>
public string RefName;
/// <summary>
/// SNP, indel, or complex.
/// </summary>
public VariantType Type { get; protected set; }
/// <summary>
/// 0-based start position of variant.
/// <code>
/// For Indels, this is the left most position
/// BEFORE the event. e.g.
/// <code>
/// AAATTTAAA -> is a deletion of length 3 starting at position 2.
/// AAA---AAA
///
/// A--TTAAA -> is an insertion of length 2 starting at position 0.
/// ACCTTAAA
///
/// ACCTTAAA -> is a deletion at position 3.
/// ACC-TAAA
/// </code>
///
/// </summary>
public int StartPosition { get; set; }
/// <summary>
/// O-based end index (same as start for SNPs).
/// A SNP is of length 1, a deletion of length 2 is 2, etc.
/// </summary>
public int Length { get; protected set; }
/// <summary>
/// The position in the alignment where this variant ends, exclusive.
/// For a SNP, this is the position AFTER the SNP. Same for indels.
///
/// <code>
/// AACCTT -> End Position = 3
/// AAGCTT
/// </code>
///
/// </summary>
public int EndPosition
{
get { return StartPosition + Length; }
}
/// <summary>
/// Is the variant at the very start or end of an alignment?
/// (that is it was called based on the first or last base seen on
/// either sequence in the alignment.)
/// These can have certain pathologies so we take note and keep an eye on them.
/// They should almost always be excluded by the alignment algorithm clipping at the ends.
/// </summary>
public bool AtEndOfAlignment { get; protected set; }
/// <summary>
/// The QV value for this call, if it exists. If not, this is set to 0.
/// </summary>
/// <value>The QV value.</value>
public byte QV {get; set;}
/// <summary>
/// Initializes a new instance of the <see cref="Bio.Variant.Variant"/> class.
/// </summary>
/// <param name="position">Position.</param>
/// <param name="atAlignmentEnd">If set to <c>true</c> at alignment end.</param>
public Variant(int position, bool atAlignmentEnd = false)
{
StartPosition = position;
AtEndOfAlignment = atAlignmentEnd;
}
/// <summary>
/// Serves as a hash function for a <see cref="Bio.Variant.Variant"/> object.
/// </summary>
/// <returns>A hash code for this instance that is suitable for use in hashing algorithms and data structures such as a
/// hash table.</returns>
public override int GetHashCode ()
{
//TODO: This is a little unsafe as these fields are mutable...
var h1 = RefName != null ? RefName.GetHashCode () : 0;
var h2 = StartPosition.GetHashCode ();
var h3 = EndPosition.GetHashCode ();
return h1 ^ h2 ^ h3;
}
/// <summary>
/// Determines whether the specified <see cref="System.Object"/> is equal to the current <see cref="Bio.Variant.Variant"/>.
///
/// Two variants are "equal" if they have the same start and end positions and are of the same type.
/// </summary>
/// <param name="obj">The <see cref="System.Object"/> to compare with the current <see cref="Bio.Variant.Variant"/>.</param>
/// <returns><c>true</c> if the specified <see cref="System.Object"/> is equal to the current
/// <see cref="Bio.Variant.Variant"/>; otherwise, <c>false</c>.</returns>
public override bool Equals (object obj)
{
var res = obj as Variant;
if (res != null)
return this.Equals (res);
return false;
}
#region IComparable implementation
/// <Docs>To be added.</Docs>
/// <para>Returns the sort order (Reference Name, Start Position) of the current variant compared to the specified object.</para>
/// <summary>
///
/// </summary>
/// <returns>The to.</returns>
/// <param name="other">Other.</param>
public int CompareTo (Variant other)
{
var nameComp = String.CompareOrdinal(RefName,other.RefName);
if (nameComp == 0) {
return StartPosition.CompareTo (other.StartPosition);
}
return nameComp;
}
#endregion
#region IEquatable implementation
/// <summary>
/// Determines whether the specified <see cref="Bio.Variant.Variant"/> is equal to the current <see cref="Bio.Variant.Variant"/>.
///
/// Two variants are equal if they have the same reference, start/end and type.
/// </summary>
/// <param name="other">The <see cref="Bio.Variant.Variant"/> to compare with the current <see cref="Bio.Variant.Variant"/>.</param>
/// <returns><c>true</c> if the specified <see cref="Bio.Variant.Variant"/> is equal to the current
/// <see cref="Bio.Variant.Variant"/>; otherwise, <c>false</c>.</returns>
bool IEquatable<Variant>.Equals (Variant other)
{
var otherComp = CompareTo (other);
if (otherComp == 0) {
return this.StartPosition == other.StartPosition && this.Type == other.Type && this.EndPosition == other.EndPosition;
}
return false;
}
#endregion
}
/// <summary>
/// The type of variant, either SNP, INDEL or Complex.
/// </summary>
public enum VariantType
{
/// <summary>
/// A SNP variant.
/// </summary>
SNP,
/// <summary>
/// And Indel variant.
/// </summary>
INDEL,
/// <summary>
/// A Complex variant. Not currently used but a place holder for more complex scenarios in the future.
/// </summary>
Complex
}
}

Просмотреть файл

@ -0,0 +1,271 @@
using System;
using System.Linq;
using System.Collections.Generic;
using System.Diagnostics;
using Bio;
using Bio.Extensions;
using Bio.Algorithms.Alignment;
namespace Bio.Variant
{
/// <summary>
/// This class takes alignments
/// and generates a list of variants.
/// </summary>
public static class VariantCaller
{
/// <summary>
/// Given two byte arrays representing a pairwise alignment, shift them so
/// that all deletions start as early as possible. For example:
///
/// <code>
/// TTTTAAAATTTT -> Converts to -> TTTTAAAATTTT
/// TTTTAA--TTTT TTTT--AATTTT
/// </code>
///
/// This function takes a IPairwiseSequenceAlignment and assumes that the first sequence is the reference and second
/// sequence is the query. It returns a new Pairwise sequence alignment with all of the indels left aligned as well as a list of variants.
/// </summary>
/// <param name="aln">Aln. The second sequence should be of type QualitativeSequence or Sequence</param>
/// <param name="callVariants">callVariants. If true, it will call variants, otherwise the second half of tuple will be null. </param>
public static Tuple<IPairwiseSequenceAlignment, List<Variant>> LeftAlignIndelsAndCallVariants(IPairwiseSequenceAlignment aln, bool callVariants = true) {
if (aln == null) {
throw new NullReferenceException ("aln");
}
if (aln.PairwiseAlignedSequences == null || aln.PairwiseAlignedSequences.Count != 1) {
throw new ArgumentException ("The pairwise aligned sequence should only have one alignment");
}
var frstAln = aln.PairwiseAlignedSequences.First ();
var seq1 = frstAln.FirstSequence;
var seq2 = frstAln.SecondSequence;
if (seq1 == null) {
throw new NullReferenceException ("seq1");
} else if (seq2 == null) {
throw new NullReferenceException ("seq2");
}
//TODO: Might implement an ambiguity check later.
#if FALSE
if (seq1.Alphabet.HasAmbiguity || seq2.Alphabet.HasAmbiguity) {
throw new ArgumentException ("Cannot left align sequences with ambiguous symbols.");
}
#endif
// Note we have to copy unless we can guarantee the array will not be mutated.
byte[] refseq = seq1.ToArray ();
ISequence newQuery;
List<Variant> variants = null;
// Call variants for a qualitative sequence
if (seq2 is QualitativeSequence) {
var qs = seq2 as QualitativeSequence;
var query = Enumerable.Zip (qs, qs.GetQualityScores (), (bp, qv) => new BPandQV (bp, (byte)qv, false)).ToArray ();
AlignmentUtils.LeftAlignIndels (refseq, query);
AlignmentUtils.VerifyNoGapsOnEnds (refseq, query);
if (callVariants) {
variants = VariantCaller.CallVariants (refseq, query, seq2.IsMarkedAsReverseComplement());
}
var newQueryQS = new QualitativeSequence (qs.Alphabet,
qs.FormatType,
query.Select (z => z.BP).ToArray (),
query.Select (p => p.QV).ToArray (),
false);
newQueryQS.Metadata = seq2.Metadata;
newQuery = newQueryQS;
} else if (seq2 is Sequence) { // For a sequence with no QV values.
var qs = seq2 as Sequence;
var query = qs.Select (v => new BPandQV (v, 0, false)).ToArray();
AlignmentUtils.LeftAlignIndels (refseq, query);
AlignmentUtils.VerifyNoGapsOnEnds (refseq, query);
// ISequence does not have a setable metadata
var newQueryS = new Sequence(qs.Alphabet, query.Select(z=>z.BP).ToArray(), false);
newQueryS.Metadata = seq2.Metadata;
if (callVariants) {
variants = VariantCaller.CallVariants (refseq, query, seq2.IsMarkedAsReverseComplement());
}
newQuery = newQueryS;
} else {
throw new ArgumentException ("Can only left align indels if the query sequence is of type Sequence or QualitativeSequence.");
}
if (aln.FirstSequence != null && aln.FirstSequence.ID != null) {
foreach (var v in variants) {
v.RefName = aln.FirstSequence.ID;
}
}
var newRef = new Sequence (seq1.Alphabet, refseq, false);
newRef.ID = seq1.ID;
newRef.Metadata = seq1.Metadata;
newQuery.ID = seq2.ID;
var newaln = new PairwiseSequenceAlignment (aln.FirstSequence, aln.SecondSequence);
var pas = new PairwiseAlignedSequence ();
pas.FirstSequence = newRef;
pas.SecondSequence = newQuery;
newaln.Add (pas);
return new Tuple<IPairwiseSequenceAlignment, List<Variant>> (newaln, variants);
}
/// <summary>
/// Given a pairwise sequence alignment, call variants, producing
/// a list of SNPs and Indels found in the alignment.
///
/// This method will first left-align all variants before calling to be consistent with other
/// software. The
/// </summary>
/// <param name="aln">The Pairwise alignment to call variants with.</param>
/// <returns></returns>
///
public static List<Variant> CallVariants(IPairwiseSequenceAlignment aln) {
return LeftAlignIndelsAndCallVariants (aln).Item2;
}
/// <summary>
/// Calls the variants.
///
/// Should only be used internally as assumptions are made that the alignments are left-aligned and fulfill certain criteria.
/// </summary>
/// <returns>The variants.</returns>
/// <param name="refSeq">Reference seq.</param>
/// <param name="querySeq">Query seq.</param>
/// <param name="originallyReverseComplemented">If set to <c>true</c> the query sequence was originally reverse complemented. (this affects QV value scoring)</param>
internal static List<Variant> CallVariants(byte[] refSeq, BPandQV[] querySeq, bool originallyReverseComplemented)
{
if (originallyReverseComplemented) {
AlignmentUtils.ReverseQVValuesForHomopolymers (querySeq);
}
List<Variant> variants = new List<Variant>();
// Now call variants.
var gap = DnaAlphabet.Instance.Gap;
int i = 0;
int refPos = 0;
int queryPos = 0;
while( i < refSeq.Length)
{
if (refSeq[i] == gap)
{
int len = AlignmentUtils.GetGapLength(i, refSeq);
var nextBasePos = (i + len);
// Should alway be true as we don't end in gaps
Debug.Assert (nextBasePos < refSeq.Length);
var hplenAndChar = determineHomoPolymerLength (nextBasePos, refSeq);
var bases = getBases(querySeq, i, len);
var newVariant = new IndelVariant(refPos - 1, len, bases, IndelType.Insertion,
hplenAndChar.Item2, hplenAndChar.Item1,
(i == 0 || (i + len + hplenAndChar.Item1) >= refSeq.Length));
newVariant.QV = querySeq[queryPos].QV;
variants.Add(newVariant);
i += len;
queryPos += len;
}
else if (querySeq[i].BP == gap)
{
int len = AlignmentUtils.GetGapLength(i, querySeq);
var bases = getBases(refSeq, i, len);
var hplenAndChar = determineHomoPolymerLength (i, refSeq);
var newVariant = new IndelVariant(refPos - 1, len, bases,
IndelType.Deletion, hplenAndChar.Item2,
hplenAndChar.Item1, (i == 0 || (i + len + hplenAndChar.Item1) >= refSeq.Length));
/* An insertion mutation occurs BEFORE pos, so normally we get the next base
* or the last one if it's a reverse complemented alignment. However, this is not true if
* it is a homopolymer because what would have been the previous position is the next position
* after left aligning and reversing the position of the QV value.
*
* Consider the following
* --*- -*--
* A-TA --> TA-T
* AGTA TACT
*
* However,
* --*-- --*--
* A-TTA ----> T-AAT
* ATTTA TAAAT
*
*/
if ((i + len ) < querySeq.Length) {
var qc_pos = originallyReverseComplemented ? i - 1 : i + len;
if (newVariant.InHomopolymer) {
qc_pos = i + len;
}
newVariant.QV = querySeq[qc_pos].QV;
}
variants.Add(newVariant);
i += len;
refPos += len;
}
else
{
if (querySeq[i].BP != refSeq[i])
{
var newVariant = new SNPVariant(refPos, (char) querySeq[i].BP, (char)refSeq[i]);
newVariant.QV = querySeq [queryPos].QV;
variants.Add(newVariant);
}
i++; refPos++; queryPos++;
}
}
return variants;
}
/// <summary>
/// Converts a subset of bases in the array into a string.
/// </summary>
/// <param name="array"></param>
/// <param name="position"></param>
/// <param name="length"></param>
/// <returns></returns>
private static string getBases(byte[] array, int position, int length)
{
char[] chars = new char[length];
for(int i=0; i<length; i++)
{
chars[i] = (char)array[i+position];
}
return new string(chars);
}
/// <summary>
/// Gets the bases for a length of the sequence as a string.
/// </summary>
/// <returns>The bases.</returns>
/// <param name="array">Array.</param>
/// <param name="position">Position.</param>
/// <param name="length">Length.</param>
private static string getBases(BPandQV[] array, int position, int length)
{
char[] chars = new char[length];
for(int i=0; i<length; i++)
{
chars[i] = (char)array[i+position].BP;
}
return new string(chars);
}
/// <summary>
/// Returns the length of the homopolymer
/// </summary>
/// <returns>The homopolymer length.</returns>
private static Tuple<int, char> determineHomoPolymerLength(int pos, byte[] refSeq)
{
byte start_bp = refSeq[pos];
int length = 1;
while ( ++pos < refSeq.Length &&
refSeq [pos] == start_bp) {
length++;
}
var homopolymerBase = (char)start_bp;
return new Tuple<int,char> (length, homopolymerBase);
}
}
}

Просмотреть файл

@ -218,6 +218,7 @@
<Compile Include="Algorithms\Padena\PadenaBvtTestCases.cs" />
<Compile Include="Algorithms\Padena\PadenaP1TestCases.cs" />
<Compile Include="PacBio\ParserTests.cs" />
<Compile Include="Variant\VariantCallTests.cs" />
</ItemGroup>
<ItemGroup>
<Content Include="..\TestData\Testcases.xml">
@ -269,5 +270,6 @@
<ItemGroup />
<ItemGroup>
<Folder Include="PacBio\" />
<Folder Include="Variant\" />
</ItemGroup>
</Project>

Просмотреть файл

@ -0,0 +1,307 @@
using System;
using System.Collections.Generic;
using System.Linq;
using Bio;
using Bio.Variant;
using Bio.Algorithms.Alignment;
using Bio.Extensions;
using NUnit.Framework;
namespace Bio.Tests
{
[TestFixture]
[Category("VariantCalls")]
public static class VariantCallTests
{
/// <summary>
/// Converts the Sequence to a QualitativeSequence in the alignment.
/// </summary>
/// <param name="aln">Aln.</param>
/// <param name="qualScores">Qual scores.</param>
public static void ConvertAlignedSequenceToQualSeq(IPairwiseSequenceAlignment aln, int[] qualScores) {
var q = aln.PairwiseAlignedSequences [0].SecondSequence as Sequence;
var qvs = new int[q.Count];
int queryPos = 0;
for (int i = 0; i < qvs.Length; i++) {
if (q [i] == '-') {
qvs [i] = 0;
} else {
qvs [i] = qualScores[queryPos++];
}
}
var qseq = new QualitativeSequence (DnaAlphabet.Instance, FastQFormatType.Sanger, q.ToArray (), qvs, false);
aln.PairwiseAlignedSequences [0].SecondSequence = qseq;
}
[Test]
public static void Test1BPInsertionCall()
{
string seq1seq = "ATA-CCCTT".Replace("-", String.Empty);
string seq2seq = "ATACCCCTT";
int[] seq2qual = new int[] { 30, 30, 30, 4, 30, 30, 30, 30, 30 };
var refseq = new Sequence(AmbiguousDnaAlphabet.Instance, seq1seq, false);
var query = new Sequence (AmbiguousDnaAlphabet.Instance, seq2seq, false);
NeedlemanWunschAligner aligner = new NeedlemanWunschAligner ();
var aln = aligner.Align (refseq, query).First();
// Need to add in the QV Values.
ConvertAlignedSequenceToQualSeq(aln, seq2qual);
var variants = VariantCaller.CallVariants (aln);
Assert.AreEqual (variants.Count, 1);
var variant = variants.First ();
Assert.AreEqual (4, variant.QV);
Assert.AreEqual (2, variant.StartPosition);
Assert.AreEqual (VariantType.INDEL, variant.Type);
var vi = variant as IndelVariant;
Assert.AreEqual ("C", vi.InsertedOrDeletedBases);
Assert.AreEqual ('C', vi.HomopolymerBase);
Assert.AreEqual (3, vi.HomopolymerLengthInReference);
Assert.AreEqual (true, vi.InHomopolymer);
Assert.AreEqual (vi.InsertionOrDeletion, IndelType.Insertion);
}
[Test]
public static void Test1BPDeletionCall()
{
string seq1seq = "ATACCCCTT";
string seq2seq = "ATA-CCCTT".Replace("-", String.Empty);
int[] seq2qual = new int[] { 30, 30, 30, 2, 30, 30, 30, 30 };
var refseq = new Sequence(AmbiguousDnaAlphabet.Instance, seq1seq, false);
var query = new Sequence (AmbiguousDnaAlphabet.Instance, seq2seq, false);
NeedlemanWunschAligner aligner = new NeedlemanWunschAligner ();
var aln = aligner.Align (refseq, query).First();
// Need to add in the QV Values.
ConvertAlignedSequenceToQualSeq(aln, seq2qual);
var variants = VariantCaller.CallVariants (aln);
Assert.AreEqual (variants.Count, 1);
var variant = variants.First ();
Assert.AreEqual (2, variant.QV);
Assert.AreEqual (2, variant.StartPosition);
Assert.AreEqual (VariantType.INDEL, variant.Type);
var vi = variant as IndelVariant;
Assert.AreEqual ("C", vi.InsertedOrDeletedBases);
Assert.AreEqual ('C', vi.HomopolymerBase);
Assert.AreEqual (4, vi.HomopolymerLengthInReference);
Assert.AreEqual (true, vi.InHomopolymer);
Assert.AreEqual (vi.InsertionOrDeletion, IndelType.Deletion);
}
[Test]
public static void TestSNPCall()
{
string seq1seq = "ATCCCCCTT";
string seq2seq = "ATCCCTCTT";
int[] seq2qual = new int[] { 30, 30, 30, 30, 5, 3, 30, 30, 30 };
var refseq = new Sequence(DnaAlphabet.Instance, seq1seq);
var query = new Sequence (DnaAlphabet.Instance, seq2seq);
NeedlemanWunschAligner aligner = new NeedlemanWunschAligner ();
var aln = aligner.Align (refseq, query).First();
ConvertAlignedSequenceToQualSeq (aln, seq2qual);
var variants = VariantCaller.CallVariants (aln);
Assert.AreEqual (variants.Count, 1);
var variant = variants.First ();
Assert.AreEqual (3, variant.QV);
Assert.AreEqual (5, variant.StartPosition);
Assert.AreEqual (variant.Type, VariantType.SNP);
var vi = variant as SNPVariant;
Assert.AreEqual (1, vi.Length);
Assert.AreEqual ('T', vi.AltBP);
Assert.AreEqual ('C', vi.RefBP);
Assert.AreEqual (VariantType.SNP, vi.Type);
Assert.AreEqual (false, vi.AtEndOfAlignment);
}
[Test]
public static void TestLeftAlignmentStep() {
var refseq = "ACAATAAAAGCGCGCGCGCGTTACGTATAT--ATGGATAT";
var queryseq = "ACAATAA-AGC--GCGC--GTTACGTATATATATGGATAT";
var r = new Sequence (DnaAlphabet.Instance, refseq);
var q = new Sequence (DnaAlphabet.Instance, queryseq);
var aln = new PairwiseSequenceAlignment (r, q);
var pas = new PairwiseAlignedSequence ();
pas.FirstSequence = r;
pas.SecondSequence = q;
aln.Add (pas);
var tpl = VariantCaller.LeftAlignIndelsAndCallVariants (aln, true);
// Check the left alignment
aln = tpl.Item1 as PairwiseSequenceAlignment;
var lar = aln.PairwiseAlignedSequences [0].FirstSequence.ConvertToString();
var laq = aln.PairwiseAlignedSequences [0].SecondSequence.ConvertToString();
var exprefseq = "ACAATAAAAGCGCGCGCGCGTTACG--TATATATGGATAT";
var expqueryseq = "ACAAT-AAA----GCGCGCGTTACGTATATATATGGATAT";
Assert.AreEqual (exprefseq, lar);
Assert.AreEqual (expqueryseq, laq);
// And it's hard, so we might as well check the variants
var variants = tpl.Item2;
Assert.AreEqual (3, variants.Count);
string[] bases = new string[] { "A", "GCGC", "TA" };
char[] hpbases = new char[] { 'A', 'G', 'T' };
bool[] inHp = new bool[] { true, false, false };
int[] lengths = new int[] { 1, 4, 2 };
int[] starts = new int[] { 4, 8, 24 };
IndelType[] types = new IndelType[] { IndelType.Deletion, IndelType.Deletion, IndelType.Insertion };
for (int i = 0; i < 3; i++) {
Assert.AreEqual (VariantType.INDEL, variants [i].Type);
var vi = variants [i] as IndelVariant;
Assert.AreEqual (hpbases[i], vi.HomopolymerBase);
Assert.AreEqual (starts [i], vi.StartPosition);
Assert.AreEqual (lengths [i], vi.Length);
Assert.AreEqual (bases [i], vi.InsertedOrDeletedBases);
Assert.AreEqual (inHp [i], vi.InHomopolymer);
Assert.AreEqual (types [i], vi.InsertionOrDeletion);
}
}
[Test]
public static void TestReverseComplement1BPIndelCall() {
string seq1seq = "ATACCCCTTGCGC";
string seq2seq = "ATA-CCCTTGCGC".Replace("-", String.Empty);
int[] seq2qual = new int[] { 30, 30, 30, 2, 30, 30, 30, 30, 30, 30, 30, 30 };
var refseq = new Sequence(DnaAlphabet.Instance, seq1seq);
var query = new Sequence (DnaAlphabet.Instance, seq2seq);
var s1rc = refseq.GetReverseComplementedSequence ();
var s2rc = query.GetReverseComplementedSequence ();
NeedlemanWunschAligner aligner = new NeedlemanWunschAligner ();
var aln = aligner.Align (s1rc, s2rc).First();
VariantCallTests.ConvertAlignedSequenceToQualSeq (aln, seq2qual.Reverse ().ToArray ());
aln.PairwiseAlignedSequences [0].Sequences [1].MarkAsReverseComplement ();
var variants = VariantCaller.CallVariants (aln);
Assert.AreEqual (variants.Count, 1);
var variant = variants.First ();
Assert.AreEqual (2, variant.QV);
Assert.AreEqual (5, variant.StartPosition);
Assert.AreEqual (VariantType.INDEL, variant.Type);
var vi = variant as IndelVariant;
Assert.AreEqual (IndelType.Deletion, vi.InsertionOrDeletion);
Assert.AreEqual ('G', vi.HomopolymerBase);
Assert.AreEqual (1, vi.Length);
Assert.AreEqual (4, vi.HomopolymerLengthInReference);
Assert.AreEqual (true, vi.InHomopolymer);
Assert.AreEqual ("G", vi.InsertedOrDeletedBases);
Assert.AreEqual (false, vi.AtEndOfAlignment);
Assert.AreEqual (6, vi.EndPosition);
}
[Test]
public static void TestTrickyQVInversions() {
// This will be hard because normally flip the QV value for a homopolymer, but in this case we won't.
// Note the whole notion of flipping is poorly defined.
string seq1seq = "ATTGC";
string seq2seq = "ATAGC";
int[] seq2qual = new int[] { 30, 30, 2, 30, 30 };
var refseq = new Sequence(DnaAlphabet.Instance, seq1seq);
var query = new Sequence (DnaAlphabet.Instance, seq2seq);
var s1rc = refseq.GetReverseComplementedSequence ();
var s2rc = query.GetReverseComplementedSequence ();
NeedlemanWunschAligner aligner = new NeedlemanWunschAligner ();
var aln = aligner.Align (s1rc, s2rc).First();
VariantCallTests.ConvertAlignedSequenceToQualSeq (aln, seq2qual.Reverse ().ToArray ());
aln.PairwiseAlignedSequences [0].Sequences [1].MarkAsReverseComplement ();
var variants = VariantCaller.CallVariants (aln);
Assert.AreEqual (1, variants.Count);
var variant = variants.First ();
Assert.AreEqual (VariantType.SNP, variant.Type);
Assert.AreEqual (2, variant.QV);
var vs = variant as SNPVariant;
Assert.AreEqual ('T', vs.AltBP);
Assert.AreEqual ('A', vs.RefBP);
}
[Test]
public static void TestInsertionAtEndofHP() {
string seq1seq = "ATA-CCC".Replace("-", String.Empty);
string seq2seq = "ATACCCC";
int[] seq2qual = new int[] { 30, 30, 30, 4, 30, 30, 30 };
var refseq = new Sequence(AmbiguousDnaAlphabet.Instance, seq1seq, false);
var query = new Sequence (AmbiguousDnaAlphabet.Instance, seq2seq, false);
NeedlemanWunschAligner aligner = new NeedlemanWunschAligner ();
var aln = aligner.Align (refseq, query).First();
// Need to add in the QV Values.
ConvertAlignedSequenceToQualSeq(aln, seq2qual);
var variants = VariantCaller.CallVariants (aln);
Assert.AreEqual (variants.Count, 1);
var variant = variants.First ();
Assert.AreEqual (4, variant.QV);
Assert.AreEqual (2, variant.StartPosition);
Assert.AreEqual (VariantType.INDEL, variant.Type);
var vi = variant as IndelVariant;
Assert.AreEqual ("C", vi.InsertedOrDeletedBases);
Assert.AreEqual ('C', vi.HomopolymerBase);
Assert.AreEqual (true, vi.AtEndOfAlignment);
Assert.AreEqual (3, vi.HomopolymerLengthInReference);
Assert.AreEqual (true, vi.InHomopolymer);
Assert.AreEqual (vi.InsertionOrDeletion, IndelType.Insertion);
}
[Test]
public static void TestDeletionAtEndofHP() {
string seq1seq = "ATACCCC";
string seq2seq = "ATA-CCC".Replace("-", String.Empty);
int[] seq2qual = new int[] { 30, 30, 30, 4, 30, 30 };
var refseq = new Sequence(AmbiguousDnaAlphabet.Instance, seq1seq, false);
var query = new Sequence (AmbiguousDnaAlphabet.Instance, seq2seq, false);
NeedlemanWunschAligner aligner = new NeedlemanWunschAligner ();
var aln = aligner.Align (refseq, query).First();
// Need to add in the QV Values.
ConvertAlignedSequenceToQualSeq(aln, seq2qual);
var variants = VariantCaller.CallVariants (aln);
Assert.AreEqual (variants.Count, 1);
var variant = variants.First ();
Assert.AreEqual (4, variant.QV);
Assert.AreEqual (2, variant.StartPosition);
Assert.AreEqual (VariantType.INDEL, variant.Type);
var vi = variant as IndelVariant;
Assert.AreEqual ("C", vi.InsertedOrDeletedBases);
Assert.AreEqual ('C', vi.HomopolymerBase);
Assert.AreEqual (true, vi.AtEndOfAlignment);
Assert.AreEqual (4, vi.HomopolymerLengthInReference);
Assert.AreEqual (true, vi.InHomopolymer);
Assert.AreEqual (vi.InsertionOrDeletion, IndelType.Deletion);
}
[Test]
public static void TestExceptionThrownForUnclippedAlignment() {
var refseq = "ACAATATA";
var queryseq = "ACAATAT-";
var r = new Sequence (DnaAlphabet.Instance, refseq);
var q = new Sequence (DnaAlphabet.Instance, queryseq);
var aln = new PairwiseSequenceAlignment (r, q);
var pas = new PairwiseAlignedSequence ();
pas.FirstSequence = r;
pas.SecondSequence = q;
aln.Add (pas);
Assert.Throws<FormatException> (() => VariantCaller.LeftAlignIndelsAndCallVariants (aln, true));
refseq = "AAACAATATA";
queryseq = "AA-CAATATA";
r = new Sequence (DnaAlphabet.Instance, refseq);
q = new Sequence (DnaAlphabet.Instance, queryseq);
aln = new PairwiseSequenceAlignment (r, q);
pas = new PairwiseAlignedSequence ();
pas.FirstSequence = r;
pas.SecondSequence = q;
aln.Add (pas);
Assert.Throws<FormatException> (() => VariantCaller.LeftAlignIndelsAndCallVariants (aln, true));
}
}
}