зеркало из https://github.com/dotnetbio/bio.git
Merge pull request #2 from dotnetbio/variantcaller
Add Bio.Variant Namespace
This commit is contained in:
Коммит
4f207b370a
|
@ -433,6 +433,12 @@
|
|||
<Compile Include="WordMatch.cs" />
|
||||
<Compile Include="IO\PacBio\PacBioCCSRead.cs" />
|
||||
<Compile Include="IO\PacBio\PacBioCCSBamReader.cs" />
|
||||
<Compile Include="Variant\AlignmentUtils.cs" />
|
||||
<Compile Include="Variant\BPandQV.cs" />
|
||||
<Compile Include="Variant\IndelVariant.cs" />
|
||||
<Compile Include="Variant\SNPVariant.cs" />
|
||||
<Compile Include="Variant\Variant.cs" />
|
||||
<Compile Include="Variant\VariantCaller.cs" />
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<EmbeddedResource Include="CloneLibrary\Resources\Library.txt" />
|
||||
|
@ -478,5 +484,6 @@ cp "$(TargetDir)\$(TargetName).xml" "$(SolutionDir)\bin\$(TargetName).xml"</Post
|
|||
-->
|
||||
<ItemGroup>
|
||||
<Folder Include="IO\PacBio\" />
|
||||
<Folder Include="Variant\" />
|
||||
</ItemGroup>
|
||||
</Project>
|
|
@ -0,0 +1,265 @@
|
|||
using System;
|
||||
using Bio.Algorithms.Alignment;
|
||||
using System.Collections.Generic;
|
||||
using System.Linq;
|
||||
using Bio.Extensions;
|
||||
using System.Diagnostics;
|
||||
|
||||
namespace Bio.Variant
|
||||
{
|
||||
/// <summary>
|
||||
/// Utilities to manipulate and work with alignments when calling variants.
|
||||
/// </summary>
|
||||
public static class AlignmentUtils
|
||||
{
|
||||
|
||||
/// <summary>
|
||||
/// The maximum number of times we can shift an indel before we fail.
|
||||
/// It should never go anywhere near this number and this is only implemented as a guard.
|
||||
/// </summary>
|
||||
private const int MAX_LOOPS = 50;
|
||||
private const string MAX_LOOPS_ERROR = "The Left-align indel step did not converge within 50 iterations";
|
||||
|
||||
/// <summary>
|
||||
/// Verifies the alignment has no leading or trailing gaps and throws an exception otherwise.
|
||||
/// </summary>
|
||||
/// <param name="refseq">Refseq.</param>
|
||||
/// <param name="query">Query.</param>
|
||||
internal static void VerifyNoGapsOnEnds(byte[] refseq, BPandQV[] query) {
|
||||
var gap = DnaAlphabet.Instance.Gap;
|
||||
if (refseq [0] == gap ||
|
||||
refseq [refseq.Length - 1] == gap ||
|
||||
query [0].BP == gap ||
|
||||
query [query.Length - 1].BP == gap) {
|
||||
throw new FormatException ("Alignment query and/or reference started with a gap character. " +
|
||||
"Alignments must be hard-clipped to remove starting and trailing variants " +
|
||||
"before variants can be called");
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Given two byte arrays representing a pairwise alignment, shift them so
|
||||
/// that all deletions start as early as possible. For example:
|
||||
/// TTTTAAAATTTT -> Converts to -> TTTTAAAATTTT
|
||||
/// TTTTAA--TTTT TTTT--AATTTT
|
||||
///
|
||||
/// This modifies the array in place.
|
||||
/// </summary>
|
||||
/// <param name="refseq">Reference Sequency</param>
|
||||
/// <param name="query">Query Sequence</param>
|
||||
/// <returns></returns>
|
||||
public static void LeftAlignIndels(byte[] refseq, BPandQV[] query)
|
||||
{
|
||||
// Validation
|
||||
if (refseq.Length != query.Length) {
|
||||
throw new ArgumentException("Alignment passed to LeftAlignIndels had unequal length sequences");
|
||||
}
|
||||
|
||||
ValidateNoOverlappingGaps (refseq, query);
|
||||
byte gap = DnaAlphabet.Instance.Gap;
|
||||
// Keep left aligning until we can't anymore, this is a
|
||||
// do while loop because some downstream left alignments open up
|
||||
// further ones upstream, even though this is rare.
|
||||
int change_count = 0;
|
||||
int loopsThrough = 0;
|
||||
do
|
||||
{
|
||||
loopsThrough++;
|
||||
change_count = 0;
|
||||
for (int i = 1; i < refseq.Length; i++)
|
||||
{
|
||||
if (refseq[i] == gap)
|
||||
{
|
||||
int len = GetGapLength(i, refseq);
|
||||
int left_side = i - 1;
|
||||
int right_side = i - 1 + len;
|
||||
while (left_side >= 0 && refseq[left_side] != gap && (refseq[left_side] == query[right_side].BP))
|
||||
{
|
||||
// Move the gap left.
|
||||
if (right_side < refseq.Length) {
|
||||
refseq[right_side] = refseq[left_side];
|
||||
}
|
||||
refseq[left_side] = gap;
|
||||
left_side--;
|
||||
right_side--;
|
||||
change_count++;
|
||||
}
|
||||
if (loopsThrough > MAX_LOOPS) {
|
||||
throw new Exception(MAX_LOOPS_ERROR);
|
||||
}
|
||||
}
|
||||
else if (query[i].BP == gap)
|
||||
{
|
||||
int len = GetGapLength(i, query);
|
||||
int left_side = i - 1;
|
||||
int right_side = i - 1 + len;
|
||||
while (left_side >= 0 && query[left_side].BP != gap && (query[left_side].BP == refseq[right_side]))
|
||||
{
|
||||
// Move the gap left.
|
||||
if (right_side < query.Length) {
|
||||
query[right_side] = query[left_side];
|
||||
}
|
||||
query[left_side] = new BPandQV(gap, 0);
|
||||
left_side--;
|
||||
right_side--;
|
||||
change_count++;
|
||||
}
|
||||
if (loopsThrough > MAX_LOOPS) {
|
||||
throw new Exception(MAX_LOOPS_ERROR);
|
||||
}
|
||||
}
|
||||
}
|
||||
} while (change_count > 0);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Given the start position of a gap, returns how long it is.
|
||||
/// For example:
|
||||
///
|
||||
/// AAAA---TTTT returns 3.
|
||||
/// </summary>
|
||||
/// <param name="pos">0 indexed</param>
|
||||
/// <param name="array"></param>
|
||||
/// <returns></returns>
|
||||
public static int GetGapLength(int pos, BPandQV[] array)
|
||||
{
|
||||
var gap = DnaAlphabet.Instance.Gap;
|
||||
int len = 1;
|
||||
while (++pos < array.Length)
|
||||
{
|
||||
if (array[pos].BP == gap)
|
||||
{
|
||||
len += 1;
|
||||
}
|
||||
else
|
||||
{
|
||||
break;
|
||||
}
|
||||
}
|
||||
return len;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Given the start position of a gap, returns how long it is.
|
||||
/// For example:
|
||||
///
|
||||
/// AAAA---TTTT returns 3.
|
||||
/// </summary>
|
||||
/// <param name="pos">0 indexed</param>
|
||||
/// <param name="array"></param>
|
||||
/// <returns></returns>
|
||||
public static int GetGapLength(int pos, byte[] array)
|
||||
{
|
||||
var gap = DnaAlphabet.Instance.Gap;
|
||||
int len = 1;
|
||||
while (++pos < array.Length)
|
||||
{
|
||||
if (array[pos] == gap)
|
||||
{
|
||||
len += 1;
|
||||
}
|
||||
else
|
||||
{
|
||||
break;
|
||||
}
|
||||
}
|
||||
return len;
|
||||
}
|
||||
|
||||
|
||||
/// <summary>
|
||||
/// Simple check that the alignment does not have a gap on top of a
|
||||
/// gap, which violates several assumptions.
|
||||
/// </summary>
|
||||
/// <param name="seq1"></param>
|
||||
/// <param name="seq2"></param>
|
||||
internal static void ValidateNoOverlappingGaps(byte[] seq1, BPandQV[] seq2)
|
||||
{
|
||||
var gap = DnaAlphabet.Instance.Gap;
|
||||
for(int i=0;i<seq1.Length;i++)
|
||||
{
|
||||
if (seq1[i] == gap && seq2[i].BP == gap)
|
||||
throw new Exception("You have an alignment with overlapping gaps. Input problem!");
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/// <summary>
|
||||
/// Reverse complements the BP and QV values accounting for homopolymers.
|
||||
///
|
||||
/// This is not a simple operation because in addition to reversing the QV scores, one must account for the fact that
|
||||
/// the quality value for homopolymer indel errors (that is a deletion or insertion) is typically only placed at the
|
||||
/// first base in a homopolymer (though this is not standardized). To account for this, when reverse complementing, we switch the
|
||||
/// QV value of the first and last base in homopolymers if the first base is lower quality than the last base.
|
||||
/// </summary>
|
||||
/// <returns>The reverse complemented sequence.</returns>
|
||||
/// <param name="toFlip">The array with the QV values to flip.</param>
|
||||
/// <param name="flipHpQvValues">If set to <c>true</c> flip hp qv values.</param>
|
||||
internal static BPandQV[] GetReverseComplementedSequence(BPandQV[] toFlip, bool flipHpQvValues = false)
|
||||
{
|
||||
BPandQV[] newData = new BPandQV[toFlip.Length];
|
||||
|
||||
for (long index = 0; index < toFlip.Length; index++)
|
||||
{
|
||||
byte complementedSymbol;
|
||||
byte symbol = toFlip[toFlip.Length - index - 1].BP;
|
||||
|
||||
if (!DnaAlphabet.Instance.TryGetComplementSymbol(symbol, out complementedSymbol))
|
||||
{
|
||||
throw new NotSupportedException("Bad character in BPandQV array: " + symbol.ToString());
|
||||
}
|
||||
var bpandq = new BPandQV(complementedSymbol, toFlip[toFlip.Length - index -1].QV);
|
||||
newData [index] = bpandq;
|
||||
}
|
||||
|
||||
if (flipHpQvValues) {
|
||||
ReverseQVValuesForHomopolymers (newData);
|
||||
}
|
||||
return newData;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Reverses the QV values for homopolymers.
|
||||
/// This is not a simple operation because in addition to reversing the QV scores, one must account for the fact that
|
||||
/// the quality value for homopolymer indel errors (that is a deletion or insertion) is typically only placed at the
|
||||
/// first base in a homopolymer (though this is not standardized). To account for this, when reverse complementing, we switch the
|
||||
/// QV value of the first and last base in homopolymers if the first base is lower quality than the last base.
|
||||
/// </summary>
|
||||
/// <returns>The QV values for homopolymers.</returns>
|
||||
/// <param name="toFlip">To flip.</param>
|
||||
internal static void ReverseQVValuesForHomopolymers(BPandQV[] toFlip) {
|
||||
// Basic idea is to assume it is A, C, G, T alphabet and flip HP values
|
||||
// Also assumes all low QV is due to HP deletion/insertion error.
|
||||
if (toFlip.Length > 1) {
|
||||
byte lastbp = toFlip [0].BP;
|
||||
int firstPos = 0;
|
||||
int curLength = 1;
|
||||
for (int i = 1; i < toFlip.Length; i++) {
|
||||
byte newbp = toFlip [i].BP;
|
||||
if (newbp != lastbp) {
|
||||
if (curLength > 1) {
|
||||
var right = toFlip [i - 1];
|
||||
var left = toFlip[firstPos];
|
||||
Debug.Assert (right.BP == left.BP);
|
||||
if (right.QV < left.QV) {
|
||||
toFlip [i - 1] = left;
|
||||
toFlip [firstPos] = right;
|
||||
}
|
||||
}
|
||||
firstPos = i;
|
||||
lastbp = newbp;
|
||||
curLength = 1;
|
||||
} else if (newbp == lastbp) {
|
||||
curLength++;
|
||||
}
|
||||
}
|
||||
// Finally flip the end
|
||||
if (curLength > 1) {
|
||||
var tmp = toFlip [toFlip.Length - 1];
|
||||
toFlip [toFlip.Length - 1] = toFlip [firstPos];
|
||||
toFlip [firstPos] = tmp;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,43 @@
|
|||
using System;
|
||||
|
||||
namespace Bio.Variant
|
||||
{
|
||||
/// <summary>
|
||||
/// A Basepair and corresponding QV value, used to keep the two values together.
|
||||
/// </summary>
|
||||
public struct BPandQV
|
||||
{
|
||||
/// <summary>
|
||||
/// A, C, G, T or '-'
|
||||
/// </summary>
|
||||
public readonly byte BP;
|
||||
|
||||
/// <summary>
|
||||
/// A QV Value for the base. Should be set to 0 if no information is available
|
||||
/// or is a gap character.
|
||||
/// </summary>
|
||||
public readonly byte QV;
|
||||
|
||||
/// <summary>
|
||||
/// Initializes a new instance of the <see cref="Bio.Variant.BPandQV"/> struct.
|
||||
/// </summary>
|
||||
/// <param name="bp">Bp.</param>
|
||||
/// <param name="qv">QV value</param>
|
||||
/// <param name="validate">If set to <c>true</c> validate that the BP is an A, C, G or T</param>
|
||||
public BPandQV (byte bp, byte qv, bool validate=true)
|
||||
{
|
||||
var bp2 = (byte)'-';
|
||||
if (validate &&
|
||||
!(bp != 'A' ||
|
||||
bp != 'C' ||
|
||||
bp != 'G' ||
|
||||
bp != 'T' ||
|
||||
bp != '-')) {
|
||||
throw new ArgumentOutOfRangeException ("bp", "Basepairs must be A, C, G, T or -");
|
||||
}
|
||||
BP = bp;
|
||||
QV = qv;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,120 @@
|
|||
using System;
|
||||
using System.Collections.Generic;
|
||||
using System.Linq;
|
||||
using System.Text;
|
||||
using System.Threading.Tasks;
|
||||
using Bio;
|
||||
|
||||
namespace Bio.Variant
|
||||
{
|
||||
/// <summary>
|
||||
/// The two types of indels, insertions or deletions.
|
||||
/// </summary>
|
||||
[Flags]
|
||||
public enum IndelType : byte {
|
||||
/// <summary>
|
||||
/// An insertion relative to the reference.
|
||||
/// </summary>
|
||||
Insertion,
|
||||
/// <summary>
|
||||
/// A deletion relative to the reference.
|
||||
/// </summary>
|
||||
Deletion}
|
||||
|
||||
|
||||
/// <summary>
|
||||
/// An Indel Variant.
|
||||
/// </summary>
|
||||
public class IndelVariant : Variant
|
||||
{
|
||||
/// <summary>
|
||||
/// Insertion or Deletion
|
||||
/// </summary>
|
||||
public readonly IndelType InsertionOrDeletion;
|
||||
|
||||
/// <summary>
|
||||
/// These are the bases in the insertion (or what was deleted).
|
||||
/// </summary>
|
||||
public readonly string InsertedOrDeletedBases;
|
||||
|
||||
/// <summary>
|
||||
/// True if the homopolymer length of the reference is > 1.
|
||||
/// <code>
|
||||
/// e.g. ACC -> TRUE
|
||||
/// A-C
|
||||
/// </code>
|
||||
/// </summary>
|
||||
/// <value><c>true</c> if in homo polymer; otherwise, <c>false</c>.</value>
|
||||
public bool InHomopolymer {
|
||||
get {
|
||||
|
||||
return HomopolymerLengthInReference > 1;
|
||||
}
|
||||
}
|
||||
|
||||
///
|
||||
/// At the position of the insertion or deletion, what is the length of the nearest homopolymer run in the reference.
|
||||
/// For insertions, the HP length is defined by the next base in the reference.
|
||||
///
|
||||
/// <code>
|
||||
/// For example, the insertion below occurs at an HP of char 'C' and length 3.
|
||||
///
|
||||
/// ref: AT-CCCTG
|
||||
/// query: ATCCCCTG
|
||||
///
|
||||
/// For multiple insertions or a long deletion, only the HP of the first position is counted,
|
||||
/// so the example below would be a 2 bp long 'C'
|
||||
///
|
||||
///
|
||||
/// ref: ATCCTGCATA
|
||||
/// query: AT-----ATA
|
||||
///
|
||||
///
|
||||
/// </code>
|
||||
///
|
||||
public int HomopolymerLengthInReference {
|
||||
get;
|
||||
set;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// A, C, G or T the basepair of the homopolymer.
|
||||
/// </summary>
|
||||
/// <value>The homopolymer base.</value>
|
||||
public char HomopolymerBase {
|
||||
get;
|
||||
set;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Initializes a new instance of the <see cref="Bio.Variant.IndelVariant"/> class.
|
||||
/// </summary>
|
||||
/// <param name="position">Position.</param>
|
||||
/// <param name="length">Length.</param>
|
||||
/// <param name="bases">Bases.</param>
|
||||
/// <param name="insertionOrDeletion">Insertion or deletion.</param>
|
||||
/// <param name="hpBase">Hp base.</param>
|
||||
/// <param name="hpLengthInReference">Hp length in reference.</param>
|
||||
/// <param name="atAlignmentEnd">If set to <c>true</c> at alignment end.</param>
|
||||
public IndelVariant(int position, int length, string bases, IndelType insertionOrDeletion, char hpBase, int hpLengthInReference, bool atAlignmentEnd = false)
|
||||
: base (position, atAlignmentEnd)
|
||||
{
|
||||
Type = VariantType.INDEL;
|
||||
InsertionOrDeletion = insertionOrDeletion;
|
||||
Length = length;
|
||||
InsertedOrDeletedBases = bases;
|
||||
HomopolymerBase = hpBase;
|
||||
HomopolymerLengthInReference = hpLengthInReference;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Returns a <see cref="System.String"/> that represents the current <see cref="Bio.Variant.IndelVariant"/>.
|
||||
/// </summary>
|
||||
/// <returns>A <see cref="System.String"/> that represents the current <see cref="Bio.Variant.IndelVariant"/>.</returns>
|
||||
public override string ToString ()
|
||||
{
|
||||
var insert = this.InsertionOrDeletion == IndelType.Deletion ? "Deletion" : "Insertion";
|
||||
return string.Format ("[IndelVariant: InHomopolymer={0}, HomopolymerLengthInReference={1}, HomopolymerBase={2}, Type={3}]", InHomopolymer, HomopolymerLengthInReference, HomopolymerBase, insert);
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,51 @@
|
|||
using System;
|
||||
using System.Collections.Generic;
|
||||
using System.Linq;
|
||||
using System.Text;
|
||||
using System.Threading.Tasks;
|
||||
using System.Diagnostics;
|
||||
using Bio;
|
||||
|
||||
|
||||
|
||||
namespace Bio.Variant
|
||||
{
|
||||
/// <summary>
|
||||
/// A class to store information about a SNP variant.
|
||||
/// </summary>
|
||||
public class SNPVariant : Variant
|
||||
{
|
||||
/// <summary>
|
||||
/// The BP in the reference at this position.
|
||||
/// </summary>
|
||||
public readonly char RefBP;
|
||||
|
||||
/// <summary>
|
||||
/// The variant present at this position.
|
||||
/// </summary>
|
||||
public readonly char AltBP;
|
||||
|
||||
/// <summary>
|
||||
/// Create a new SNP in the given reference at the given position.
|
||||
/// </summary>
|
||||
/// <param name="position">0-based position on reference.</param>
|
||||
/// <param name="altAllele">The variant present (A, C, G, T)</param>
|
||||
/// <param name="refBP">The Reference sequence.</param>
|
||||
public SNPVariant(int position, char altAllele, char refBP) :
|
||||
base (position)
|
||||
{
|
||||
AltBP = altAllele;
|
||||
Type = VariantType.SNP;
|
||||
RefBP = (char) refBP;
|
||||
Length = 1;
|
||||
}
|
||||
/// <summary>
|
||||
/// Returns a <see cref="System.String"/> that represents the current <see cref="Bio.Variant.SNPVariant"/>.
|
||||
/// </summary>
|
||||
/// <returns>A <see cref="System.String"/> that represents the current <see cref="Bio.Variant.SNPVariant"/>.</returns>
|
||||
public override string ToString ()
|
||||
{
|
||||
return RefBP + "->" + AltBP + " @ " + StartPosition;
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,179 @@
|
|||
using System;
|
||||
using System.Collections.Generic;
|
||||
using System.Linq;
|
||||
using System.Text;
|
||||
using System.Threading.Tasks;
|
||||
using Bio;
|
||||
|
||||
namespace Bio.Variant
|
||||
{
|
||||
/// <summary>
|
||||
/// Top level class for holding variant information.
|
||||
/// This is implemented in two sub classes, SNPVariant and IndelVariant.
|
||||
/// </summary>
|
||||
public abstract class Variant : IComparable<Variant>, IEquatable<Variant>
|
||||
{
|
||||
/// <summary>
|
||||
/// The name of the reference sequence the variant was called from.
|
||||
/// </summary>
|
||||
public string RefName;
|
||||
|
||||
/// <summary>
|
||||
/// SNP, indel, or complex.
|
||||
/// </summary>
|
||||
public VariantType Type { get; protected set; }
|
||||
|
||||
/// <summary>
|
||||
/// 0-based start position of variant.
|
||||
/// <code>
|
||||
/// For Indels, this is the left most position
|
||||
/// BEFORE the event. e.g.
|
||||
/// <code>
|
||||
/// AAATTTAAA -> is a deletion of length 3 starting at position 2.
|
||||
/// AAA---AAA
|
||||
///
|
||||
/// A--TTAAA -> is an insertion of length 2 starting at position 0.
|
||||
/// ACCTTAAA
|
||||
///
|
||||
/// ACCTTAAA -> is a deletion at position 3.
|
||||
/// ACC-TAAA
|
||||
/// </code>
|
||||
///
|
||||
/// </summary>
|
||||
public int StartPosition { get; set; }
|
||||
|
||||
/// <summary>
|
||||
/// O-based end index (same as start for SNPs).
|
||||
/// A SNP is of length 1, a deletion of length 2 is 2, etc.
|
||||
/// </summary>
|
||||
public int Length { get; protected set; }
|
||||
|
||||
/// <summary>
|
||||
/// The position in the alignment where this variant ends, exclusive.
|
||||
/// For a SNP, this is the position AFTER the SNP. Same for indels.
|
||||
///
|
||||
/// <code>
|
||||
/// AACCTT -> End Position = 3
|
||||
/// AAGCTT
|
||||
/// </code>
|
||||
///
|
||||
/// </summary>
|
||||
public int EndPosition
|
||||
{
|
||||
get { return StartPosition + Length; }
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Is the variant at the very start or end of an alignment?
|
||||
/// (that is it was called based on the first or last base seen on
|
||||
/// either sequence in the alignment.)
|
||||
/// These can have certain pathologies so we take note and keep an eye on them.
|
||||
/// They should almost always be excluded by the alignment algorithm clipping at the ends.
|
||||
/// </summary>
|
||||
public bool AtEndOfAlignment { get; protected set; }
|
||||
|
||||
/// <summary>
|
||||
/// The QV value for this call, if it exists. If not, this is set to 0.
|
||||
/// </summary>
|
||||
/// <value>The QV value.</value>
|
||||
public byte QV {get; set;}
|
||||
|
||||
/// <summary>
|
||||
/// Initializes a new instance of the <see cref="Bio.Variant.Variant"/> class.
|
||||
/// </summary>
|
||||
/// <param name="position">Position.</param>
|
||||
/// <param name="atAlignmentEnd">If set to <c>true</c> at alignment end.</param>
|
||||
public Variant(int position, bool atAlignmentEnd = false)
|
||||
{
|
||||
StartPosition = position;
|
||||
AtEndOfAlignment = atAlignmentEnd;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Serves as a hash function for a <see cref="Bio.Variant.Variant"/> object.
|
||||
/// </summary>
|
||||
/// <returns>A hash code for this instance that is suitable for use in hashing algorithms and data structures such as a
|
||||
/// hash table.</returns>
|
||||
public override int GetHashCode ()
|
||||
{
|
||||
//TODO: This is a little unsafe as these fields are mutable...
|
||||
var h1 = RefName != null ? RefName.GetHashCode () : 0;
|
||||
var h2 = StartPosition.GetHashCode ();
|
||||
var h3 = EndPosition.GetHashCode ();
|
||||
return h1 ^ h2 ^ h3;
|
||||
}
|
||||
/// <summary>
|
||||
/// Determines whether the specified <see cref="System.Object"/> is equal to the current <see cref="Bio.Variant.Variant"/>.
|
||||
///
|
||||
/// Two variants are "equal" if they have the same start and end positions and are of the same type.
|
||||
/// </summary>
|
||||
/// <param name="obj">The <see cref="System.Object"/> to compare with the current <see cref="Bio.Variant.Variant"/>.</param>
|
||||
/// <returns><c>true</c> if the specified <see cref="System.Object"/> is equal to the current
|
||||
/// <see cref="Bio.Variant.Variant"/>; otherwise, <c>false</c>.</returns>
|
||||
public override bool Equals (object obj)
|
||||
{
|
||||
var res = obj as Variant;
|
||||
if (res != null)
|
||||
return this.Equals (res);
|
||||
return false;
|
||||
}
|
||||
#region IComparable implementation
|
||||
|
||||
/// <Docs>To be added.</Docs>
|
||||
/// <para>Returns the sort order (Reference Name, Start Position) of the current variant compared to the specified object.</para>
|
||||
/// <summary>
|
||||
///
|
||||
/// </summary>
|
||||
/// <returns>The to.</returns>
|
||||
/// <param name="other">Other.</param>
|
||||
public int CompareTo (Variant other)
|
||||
{
|
||||
var nameComp = String.CompareOrdinal(RefName,other.RefName);
|
||||
if (nameComp == 0) {
|
||||
return StartPosition.CompareTo (other.StartPosition);
|
||||
}
|
||||
return nameComp;
|
||||
}
|
||||
|
||||
#endregion
|
||||
|
||||
#region IEquatable implementation
|
||||
/// <summary>
|
||||
/// Determines whether the specified <see cref="Bio.Variant.Variant"/> is equal to the current <see cref="Bio.Variant.Variant"/>.
|
||||
///
|
||||
/// Two variants are equal if they have the same reference, start/end and type.
|
||||
/// </summary>
|
||||
/// <param name="other">The <see cref="Bio.Variant.Variant"/> to compare with the current <see cref="Bio.Variant.Variant"/>.</param>
|
||||
/// <returns><c>true</c> if the specified <see cref="Bio.Variant.Variant"/> is equal to the current
|
||||
/// <see cref="Bio.Variant.Variant"/>; otherwise, <c>false</c>.</returns>
|
||||
bool IEquatable<Variant>.Equals (Variant other)
|
||||
{
|
||||
var otherComp = CompareTo (other);
|
||||
if (otherComp == 0) {
|
||||
return this.StartPosition == other.StartPosition && this.Type == other.Type && this.EndPosition == other.EndPosition;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
#endregion
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// The type of variant, either SNP, INDEL or Complex.
|
||||
/// </summary>
|
||||
public enum VariantType
|
||||
{
|
||||
/// <summary>
|
||||
/// A SNP variant.
|
||||
/// </summary>
|
||||
SNP,
|
||||
/// <summary>
|
||||
/// And Indel variant.
|
||||
/// </summary>
|
||||
INDEL,
|
||||
/// <summary>
|
||||
/// A Complex variant. Not currently used but a place holder for more complex scenarios in the future.
|
||||
/// </summary>
|
||||
Complex
|
||||
}
|
||||
}
|
|
@ -0,0 +1,271 @@
|
|||
using System;
|
||||
using System.Linq;
|
||||
using System.Collections.Generic;
|
||||
using System.Diagnostics;
|
||||
|
||||
using Bio;
|
||||
using Bio.Extensions;
|
||||
using Bio.Algorithms.Alignment;
|
||||
|
||||
namespace Bio.Variant
|
||||
{
|
||||
/// <summary>
|
||||
/// This class takes alignments
|
||||
/// and generates a list of variants.
|
||||
/// </summary>
|
||||
public static class VariantCaller
|
||||
{
|
||||
|
||||
/// <summary>
|
||||
/// Given two byte arrays representing a pairwise alignment, shift them so
|
||||
/// that all deletions start as early as possible. For example:
|
||||
///
|
||||
/// <code>
|
||||
/// TTTTAAAATTTT -> Converts to -> TTTTAAAATTTT
|
||||
/// TTTTAA--TTTT TTTT--AATTTT
|
||||
/// </code>
|
||||
///
|
||||
/// This function takes a IPairwiseSequenceAlignment and assumes that the first sequence is the reference and second
|
||||
/// sequence is the query. It returns a new Pairwise sequence alignment with all of the indels left aligned as well as a list of variants.
|
||||
/// </summary>
|
||||
/// <param name="aln">Aln. The second sequence should be of type QualitativeSequence or Sequence</param>
|
||||
/// <param name="callVariants">callVariants. If true, it will call variants, otherwise the second half of tuple will be null. </param>
|
||||
public static Tuple<IPairwiseSequenceAlignment, List<Variant>> LeftAlignIndelsAndCallVariants(IPairwiseSequenceAlignment aln, bool callVariants = true) {
|
||||
|
||||
if (aln == null) {
|
||||
throw new NullReferenceException ("aln");
|
||||
}
|
||||
if (aln.PairwiseAlignedSequences == null || aln.PairwiseAlignedSequences.Count != 1) {
|
||||
throw new ArgumentException ("The pairwise aligned sequence should only have one alignment");
|
||||
}
|
||||
var frstAln = aln.PairwiseAlignedSequences.First ();
|
||||
var seq1 = frstAln.FirstSequence;
|
||||
var seq2 = frstAln.SecondSequence;
|
||||
if (seq1 == null) {
|
||||
throw new NullReferenceException ("seq1");
|
||||
} else if (seq2 == null) {
|
||||
throw new NullReferenceException ("seq2");
|
||||
}
|
||||
|
||||
//TODO: Might implement an ambiguity check later.
|
||||
#if FALSE
|
||||
if (seq1.Alphabet.HasAmbiguity || seq2.Alphabet.HasAmbiguity) {
|
||||
throw new ArgumentException ("Cannot left align sequences with ambiguous symbols.");
|
||||
}
|
||||
#endif
|
||||
|
||||
// Note we have to copy unless we can guarantee the array will not be mutated.
|
||||
byte[] refseq = seq1.ToArray ();
|
||||
ISequence newQuery;
|
||||
List<Variant> variants = null;
|
||||
// Call variants for a qualitative sequence
|
||||
if (seq2 is QualitativeSequence) {
|
||||
var qs = seq2 as QualitativeSequence;
|
||||
var query = Enumerable.Zip (qs, qs.GetQualityScores (), (bp, qv) => new BPandQV (bp, (byte)qv, false)).ToArray ();
|
||||
AlignmentUtils.LeftAlignIndels (refseq, query);
|
||||
AlignmentUtils.VerifyNoGapsOnEnds (refseq, query);
|
||||
if (callVariants) {
|
||||
variants = VariantCaller.CallVariants (refseq, query, seq2.IsMarkedAsReverseComplement());
|
||||
}
|
||||
var newQueryQS = new QualitativeSequence (qs.Alphabet,
|
||||
qs.FormatType,
|
||||
query.Select (z => z.BP).ToArray (),
|
||||
query.Select (p => p.QV).ToArray (),
|
||||
false);
|
||||
newQueryQS.Metadata = seq2.Metadata;
|
||||
newQuery = newQueryQS;
|
||||
|
||||
} else if (seq2 is Sequence) { // For a sequence with no QV values.
|
||||
var qs = seq2 as Sequence;
|
||||
var query = qs.Select (v => new BPandQV (v, 0, false)).ToArray();
|
||||
AlignmentUtils.LeftAlignIndels (refseq, query);
|
||||
AlignmentUtils.VerifyNoGapsOnEnds (refseq, query);
|
||||
// ISequence does not have a setable metadata
|
||||
var newQueryS = new Sequence(qs.Alphabet, query.Select(z=>z.BP).ToArray(), false);
|
||||
newQueryS.Metadata = seq2.Metadata;
|
||||
if (callVariants) {
|
||||
variants = VariantCaller.CallVariants (refseq, query, seq2.IsMarkedAsReverseComplement());
|
||||
}
|
||||
newQuery = newQueryS;
|
||||
} else {
|
||||
throw new ArgumentException ("Can only left align indels if the query sequence is of type Sequence or QualitativeSequence.");
|
||||
}
|
||||
|
||||
if (aln.FirstSequence != null && aln.FirstSequence.ID != null) {
|
||||
foreach (var v in variants) {
|
||||
v.RefName = aln.FirstSequence.ID;
|
||||
}
|
||||
}
|
||||
|
||||
var newRef = new Sequence (seq1.Alphabet, refseq, false);
|
||||
newRef.ID = seq1.ID;
|
||||
newRef.Metadata = seq1.Metadata;
|
||||
|
||||
newQuery.ID = seq2.ID;
|
||||
|
||||
var newaln = new PairwiseSequenceAlignment (aln.FirstSequence, aln.SecondSequence);
|
||||
var pas = new PairwiseAlignedSequence ();
|
||||
pas.FirstSequence = newRef;
|
||||
pas.SecondSequence = newQuery;
|
||||
newaln.Add (pas);
|
||||
return new Tuple<IPairwiseSequenceAlignment, List<Variant>> (newaln, variants);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Given a pairwise sequence alignment, call variants, producing
|
||||
/// a list of SNPs and Indels found in the alignment.
|
||||
///
|
||||
/// This method will first left-align all variants before calling to be consistent with other
|
||||
/// software. The
|
||||
/// </summary>
|
||||
/// <param name="aln">The Pairwise alignment to call variants with.</param>
|
||||
/// <returns></returns>
|
||||
///
|
||||
|
||||
public static List<Variant> CallVariants(IPairwiseSequenceAlignment aln) {
|
||||
return LeftAlignIndelsAndCallVariants (aln).Item2;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Calls the variants.
|
||||
///
|
||||
/// Should only be used internally as assumptions are made that the alignments are left-aligned and fulfill certain criteria.
|
||||
/// </summary>
|
||||
/// <returns>The variants.</returns>
|
||||
/// <param name="refSeq">Reference seq.</param>
|
||||
/// <param name="querySeq">Query seq.</param>
|
||||
/// <param name="originallyReverseComplemented">If set to <c>true</c> the query sequence was originally reverse complemented. (this affects QV value scoring)</param>
|
||||
internal static List<Variant> CallVariants(byte[] refSeq, BPandQV[] querySeq, bool originallyReverseComplemented)
|
||||
{
|
||||
if (originallyReverseComplemented) {
|
||||
AlignmentUtils.ReverseQVValuesForHomopolymers (querySeq);
|
||||
}
|
||||
List<Variant> variants = new List<Variant>();
|
||||
|
||||
// Now call variants.
|
||||
var gap = DnaAlphabet.Instance.Gap;
|
||||
int i = 0;
|
||||
int refPos = 0;
|
||||
int queryPos = 0;
|
||||
while( i < refSeq.Length)
|
||||
{
|
||||
if (refSeq[i] == gap)
|
||||
{
|
||||
int len = AlignmentUtils.GetGapLength(i, refSeq);
|
||||
var nextBasePos = (i + len);
|
||||
// Should alway be true as we don't end in gaps
|
||||
Debug.Assert (nextBasePos < refSeq.Length);
|
||||
var hplenAndChar = determineHomoPolymerLength (nextBasePos, refSeq);
|
||||
var bases = getBases(querySeq, i, len);
|
||||
var newVariant = new IndelVariant(refPos - 1, len, bases, IndelType.Insertion,
|
||||
hplenAndChar.Item2, hplenAndChar.Item1,
|
||||
(i == 0 || (i + len + hplenAndChar.Item1) >= refSeq.Length));
|
||||
newVariant.QV = querySeq[queryPos].QV;
|
||||
variants.Add(newVariant);
|
||||
i += len;
|
||||
queryPos += len;
|
||||
}
|
||||
else if (querySeq[i].BP == gap)
|
||||
{
|
||||
int len = AlignmentUtils.GetGapLength(i, querySeq);
|
||||
var bases = getBases(refSeq, i, len);
|
||||
var hplenAndChar = determineHomoPolymerLength (i, refSeq);
|
||||
var newVariant = new IndelVariant(refPos - 1, len, bases,
|
||||
IndelType.Deletion, hplenAndChar.Item2,
|
||||
hplenAndChar.Item1, (i == 0 || (i + len + hplenAndChar.Item1) >= refSeq.Length));
|
||||
/* An insertion mutation occurs BEFORE pos, so normally we get the next base
|
||||
* or the last one if it's a reverse complemented alignment. However, this is not true if
|
||||
* it is a homopolymer because what would have been the previous position is the next position
|
||||
* after left aligning and reversing the position of the QV value.
|
||||
*
|
||||
* Consider the following
|
||||
* --*- -*--
|
||||
* A-TA --> TA-T
|
||||
* AGTA TACT
|
||||
*
|
||||
* However,
|
||||
* --*-- --*--
|
||||
* A-TTA ----> T-AAT
|
||||
* ATTTA TAAAT
|
||||
*
|
||||
*/
|
||||
if ((i + len ) < querySeq.Length) {
|
||||
|
||||
var qc_pos = originallyReverseComplemented ? i - 1 : i + len;
|
||||
if (newVariant.InHomopolymer) {
|
||||
qc_pos = i + len;
|
||||
}
|
||||
newVariant.QV = querySeq[qc_pos].QV;
|
||||
}
|
||||
variants.Add(newVariant);
|
||||
i += len;
|
||||
refPos += len;
|
||||
}
|
||||
else
|
||||
{
|
||||
if (querySeq[i].BP != refSeq[i])
|
||||
{
|
||||
var newVariant = new SNPVariant(refPos, (char) querySeq[i].BP, (char)refSeq[i]);
|
||||
newVariant.QV = querySeq [queryPos].QV;
|
||||
variants.Add(newVariant);
|
||||
}
|
||||
i++; refPos++; queryPos++;
|
||||
}
|
||||
}
|
||||
return variants;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Converts a subset of bases in the array into a string.
|
||||
/// </summary>
|
||||
/// <param name="array"></param>
|
||||
/// <param name="position"></param>
|
||||
/// <param name="length"></param>
|
||||
/// <returns></returns>
|
||||
private static string getBases(byte[] array, int position, int length)
|
||||
{
|
||||
char[] chars = new char[length];
|
||||
for(int i=0; i<length; i++)
|
||||
{
|
||||
chars[i] = (char)array[i+position];
|
||||
}
|
||||
return new string(chars);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Gets the bases for a length of the sequence as a string.
|
||||
/// </summary>
|
||||
/// <returns>The bases.</returns>
|
||||
/// <param name="array">Array.</param>
|
||||
/// <param name="position">Position.</param>
|
||||
/// <param name="length">Length.</param>
|
||||
private static string getBases(BPandQV[] array, int position, int length)
|
||||
{
|
||||
char[] chars = new char[length];
|
||||
for(int i=0; i<length; i++)
|
||||
{
|
||||
chars[i] = (char)array[i+position].BP;
|
||||
}
|
||||
return new string(chars);
|
||||
}
|
||||
|
||||
|
||||
/// <summary>
|
||||
/// Returns the length of the homopolymer
|
||||
/// </summary>
|
||||
/// <returns>The homopolymer length.</returns>
|
||||
private static Tuple<int, char> determineHomoPolymerLength(int pos, byte[] refSeq)
|
||||
{
|
||||
|
||||
byte start_bp = refSeq[pos];
|
||||
int length = 1;
|
||||
while ( ++pos < refSeq.Length &&
|
||||
refSeq [pos] == start_bp) {
|
||||
length++;
|
||||
}
|
||||
var homopolymerBase = (char)start_bp;
|
||||
return new Tuple<int,char> (length, homopolymerBase);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -218,6 +218,7 @@
|
|||
<Compile Include="Algorithms\Padena\PadenaBvtTestCases.cs" />
|
||||
<Compile Include="Algorithms\Padena\PadenaP1TestCases.cs" />
|
||||
<Compile Include="PacBio\ParserTests.cs" />
|
||||
<Compile Include="Variant\VariantCallTests.cs" />
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<Content Include="..\TestData\Testcases.xml">
|
||||
|
@ -269,5 +270,6 @@
|
|||
<ItemGroup />
|
||||
<ItemGroup>
|
||||
<Folder Include="PacBio\" />
|
||||
<Folder Include="Variant\" />
|
||||
</ItemGroup>
|
||||
</Project>
|
|
@ -0,0 +1,307 @@
|
|||
using System;
|
||||
using System.Collections.Generic;
|
||||
using System.Linq;
|
||||
|
||||
using Bio;
|
||||
using Bio.Variant;
|
||||
using Bio.Algorithms.Alignment;
|
||||
using Bio.Extensions;
|
||||
using NUnit.Framework;
|
||||
|
||||
namespace Bio.Tests
|
||||
{
|
||||
[TestFixture]
|
||||
[Category("VariantCalls")]
|
||||
public static class VariantCallTests
|
||||
{
|
||||
/// <summary>
|
||||
/// Converts the Sequence to a QualitativeSequence in the alignment.
|
||||
/// </summary>
|
||||
/// <param name="aln">Aln.</param>
|
||||
/// <param name="qualScores">Qual scores.</param>
|
||||
public static void ConvertAlignedSequenceToQualSeq(IPairwiseSequenceAlignment aln, int[] qualScores) {
|
||||
var q = aln.PairwiseAlignedSequences [0].SecondSequence as Sequence;
|
||||
var qvs = new int[q.Count];
|
||||
int queryPos = 0;
|
||||
for (int i = 0; i < qvs.Length; i++) {
|
||||
if (q [i] == '-') {
|
||||
qvs [i] = 0;
|
||||
} else {
|
||||
qvs [i] = qualScores[queryPos++];
|
||||
}
|
||||
}
|
||||
var qseq = new QualitativeSequence (DnaAlphabet.Instance, FastQFormatType.Sanger, q.ToArray (), qvs, false);
|
||||
|
||||
aln.PairwiseAlignedSequences [0].SecondSequence = qseq;
|
||||
|
||||
}
|
||||
|
||||
[Test]
|
||||
|
||||
public static void Test1BPInsertionCall()
|
||||
{
|
||||
string seq1seq = "ATA-CCCTT".Replace("-", String.Empty);
|
||||
string seq2seq = "ATACCCCTT";
|
||||
int[] seq2qual = new int[] { 30, 30, 30, 4, 30, 30, 30, 30, 30 };
|
||||
var refseq = new Sequence(AmbiguousDnaAlphabet.Instance, seq1seq, false);
|
||||
var query = new Sequence (AmbiguousDnaAlphabet.Instance, seq2seq, false);
|
||||
NeedlemanWunschAligner aligner = new NeedlemanWunschAligner ();
|
||||
var aln = aligner.Align (refseq, query).First();
|
||||
// Need to add in the QV Values.
|
||||
ConvertAlignedSequenceToQualSeq(aln, seq2qual);
|
||||
var variants = VariantCaller.CallVariants (aln);
|
||||
Assert.AreEqual (variants.Count, 1);
|
||||
var variant = variants.First ();
|
||||
Assert.AreEqual (4, variant.QV);
|
||||
Assert.AreEqual (2, variant.StartPosition);
|
||||
Assert.AreEqual (VariantType.INDEL, variant.Type);
|
||||
var vi = variant as IndelVariant;
|
||||
Assert.AreEqual ("C", vi.InsertedOrDeletedBases);
|
||||
Assert.AreEqual ('C', vi.HomopolymerBase);
|
||||
Assert.AreEqual (3, vi.HomopolymerLengthInReference);
|
||||
Assert.AreEqual (true, vi.InHomopolymer);
|
||||
Assert.AreEqual (vi.InsertionOrDeletion, IndelType.Insertion);
|
||||
}
|
||||
|
||||
[Test]
|
||||
public static void Test1BPDeletionCall()
|
||||
{
|
||||
string seq1seq = "ATACCCCTT";
|
||||
string seq2seq = "ATA-CCCTT".Replace("-", String.Empty);
|
||||
int[] seq2qual = new int[] { 30, 30, 30, 2, 30, 30, 30, 30 };
|
||||
var refseq = new Sequence(AmbiguousDnaAlphabet.Instance, seq1seq, false);
|
||||
var query = new Sequence (AmbiguousDnaAlphabet.Instance, seq2seq, false);
|
||||
NeedlemanWunschAligner aligner = new NeedlemanWunschAligner ();
|
||||
var aln = aligner.Align (refseq, query).First();
|
||||
// Need to add in the QV Values.
|
||||
ConvertAlignedSequenceToQualSeq(aln, seq2qual);
|
||||
var variants = VariantCaller.CallVariants (aln);
|
||||
Assert.AreEqual (variants.Count, 1);
|
||||
var variant = variants.First ();
|
||||
Assert.AreEqual (2, variant.QV);
|
||||
Assert.AreEqual (2, variant.StartPosition);
|
||||
Assert.AreEqual (VariantType.INDEL, variant.Type);
|
||||
var vi = variant as IndelVariant;
|
||||
Assert.AreEqual ("C", vi.InsertedOrDeletedBases);
|
||||
Assert.AreEqual ('C', vi.HomopolymerBase);
|
||||
Assert.AreEqual (4, vi.HomopolymerLengthInReference);
|
||||
Assert.AreEqual (true, vi.InHomopolymer);
|
||||
Assert.AreEqual (vi.InsertionOrDeletion, IndelType.Deletion);
|
||||
}
|
||||
|
||||
[Test]
|
||||
public static void TestSNPCall()
|
||||
{
|
||||
string seq1seq = "ATCCCCCTT";
|
||||
string seq2seq = "ATCCCTCTT";
|
||||
int[] seq2qual = new int[] { 30, 30, 30, 30, 5, 3, 30, 30, 30 };
|
||||
var refseq = new Sequence(DnaAlphabet.Instance, seq1seq);
|
||||
var query = new Sequence (DnaAlphabet.Instance, seq2seq);
|
||||
|
||||
NeedlemanWunschAligner aligner = new NeedlemanWunschAligner ();
|
||||
var aln = aligner.Align (refseq, query).First();
|
||||
ConvertAlignedSequenceToQualSeq (aln, seq2qual);
|
||||
var variants = VariantCaller.CallVariants (aln);
|
||||
Assert.AreEqual (variants.Count, 1);
|
||||
var variant = variants.First ();
|
||||
Assert.AreEqual (3, variant.QV);
|
||||
Assert.AreEqual (5, variant.StartPosition);
|
||||
Assert.AreEqual (variant.Type, VariantType.SNP);
|
||||
var vi = variant as SNPVariant;
|
||||
Assert.AreEqual (1, vi.Length);
|
||||
Assert.AreEqual ('T', vi.AltBP);
|
||||
Assert.AreEqual ('C', vi.RefBP);
|
||||
Assert.AreEqual (VariantType.SNP, vi.Type);
|
||||
Assert.AreEqual (false, vi.AtEndOfAlignment);
|
||||
}
|
||||
|
||||
[Test]
|
||||
public static void TestLeftAlignmentStep() {
|
||||
var refseq = "ACAATAAAAGCGCGCGCGCGTTACGTATAT--ATGGATAT";
|
||||
var queryseq = "ACAATAA-AGC--GCGC--GTTACGTATATATATGGATAT";
|
||||
|
||||
var r = new Sequence (DnaAlphabet.Instance, refseq);
|
||||
var q = new Sequence (DnaAlphabet.Instance, queryseq);
|
||||
var aln = new PairwiseSequenceAlignment (r, q);
|
||||
var pas = new PairwiseAlignedSequence ();
|
||||
pas.FirstSequence = r;
|
||||
pas.SecondSequence = q;
|
||||
aln.Add (pas);
|
||||
var tpl = VariantCaller.LeftAlignIndelsAndCallVariants (aln, true);
|
||||
|
||||
// Check the left alignment
|
||||
aln = tpl.Item1 as PairwiseSequenceAlignment;
|
||||
var lar = aln.PairwiseAlignedSequences [0].FirstSequence.ConvertToString();
|
||||
var laq = aln.PairwiseAlignedSequences [0].SecondSequence.ConvertToString();
|
||||
var exprefseq = "ACAATAAAAGCGCGCGCGCGTTACG--TATATATGGATAT";
|
||||
var expqueryseq = "ACAAT-AAA----GCGCGCGTTACGTATATATATGGATAT";
|
||||
Assert.AreEqual (exprefseq, lar);
|
||||
Assert.AreEqual (expqueryseq, laq);
|
||||
|
||||
// And it's hard, so we might as well check the variants
|
||||
var variants = tpl.Item2;
|
||||
Assert.AreEqual (3, variants.Count);
|
||||
string[] bases = new string[] { "A", "GCGC", "TA" };
|
||||
char[] hpbases = new char[] { 'A', 'G', 'T' };
|
||||
bool[] inHp = new bool[] { true, false, false };
|
||||
int[] lengths = new int[] { 1, 4, 2 };
|
||||
int[] starts = new int[] { 4, 8, 24 };
|
||||
IndelType[] types = new IndelType[] { IndelType.Deletion, IndelType.Deletion, IndelType.Insertion };
|
||||
for (int i = 0; i < 3; i++) {
|
||||
Assert.AreEqual (VariantType.INDEL, variants [i].Type);
|
||||
var vi = variants [i] as IndelVariant;
|
||||
Assert.AreEqual (hpbases[i], vi.HomopolymerBase);
|
||||
Assert.AreEqual (starts [i], vi.StartPosition);
|
||||
Assert.AreEqual (lengths [i], vi.Length);
|
||||
Assert.AreEqual (bases [i], vi.InsertedOrDeletedBases);
|
||||
Assert.AreEqual (inHp [i], vi.InHomopolymer);
|
||||
Assert.AreEqual (types [i], vi.InsertionOrDeletion);
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
[Test]
|
||||
public static void TestReverseComplement1BPIndelCall() {
|
||||
|
||||
string seq1seq = "ATACCCCTTGCGC";
|
||||
string seq2seq = "ATA-CCCTTGCGC".Replace("-", String.Empty);
|
||||
int[] seq2qual = new int[] { 30, 30, 30, 2, 30, 30, 30, 30, 30, 30, 30, 30 };
|
||||
var refseq = new Sequence(DnaAlphabet.Instance, seq1seq);
|
||||
var query = new Sequence (DnaAlphabet.Instance, seq2seq);
|
||||
|
||||
var s1rc = refseq.GetReverseComplementedSequence ();
|
||||
var s2rc = query.GetReverseComplementedSequence ();
|
||||
|
||||
NeedlemanWunschAligner aligner = new NeedlemanWunschAligner ();
|
||||
var aln = aligner.Align (s1rc, s2rc).First();
|
||||
VariantCallTests.ConvertAlignedSequenceToQualSeq (aln, seq2qual.Reverse ().ToArray ());
|
||||
aln.PairwiseAlignedSequences [0].Sequences [1].MarkAsReverseComplement ();
|
||||
var variants = VariantCaller.CallVariants (aln);
|
||||
Assert.AreEqual (variants.Count, 1);
|
||||
var variant = variants.First ();
|
||||
Assert.AreEqual (2, variant.QV);
|
||||
Assert.AreEqual (5, variant.StartPosition);
|
||||
Assert.AreEqual (VariantType.INDEL, variant.Type);
|
||||
var vi = variant as IndelVariant;
|
||||
Assert.AreEqual (IndelType.Deletion, vi.InsertionOrDeletion);
|
||||
Assert.AreEqual ('G', vi.HomopolymerBase);
|
||||
Assert.AreEqual (1, vi.Length);
|
||||
Assert.AreEqual (4, vi.HomopolymerLengthInReference);
|
||||
Assert.AreEqual (true, vi.InHomopolymer);
|
||||
Assert.AreEqual ("G", vi.InsertedOrDeletedBases);
|
||||
Assert.AreEqual (false, vi.AtEndOfAlignment);
|
||||
Assert.AreEqual (6, vi.EndPosition);
|
||||
}
|
||||
|
||||
|
||||
[Test]
|
||||
public static void TestTrickyQVInversions() {
|
||||
// This will be hard because normally flip the QV value for a homopolymer, but in this case we won't.
|
||||
// Note the whole notion of flipping is poorly defined.
|
||||
string seq1seq = "ATTGC";
|
||||
string seq2seq = "ATAGC";
|
||||
int[] seq2qual = new int[] { 30, 30, 2, 30, 30 };
|
||||
var refseq = new Sequence(DnaAlphabet.Instance, seq1seq);
|
||||
var query = new Sequence (DnaAlphabet.Instance, seq2seq);
|
||||
|
||||
var s1rc = refseq.GetReverseComplementedSequence ();
|
||||
var s2rc = query.GetReverseComplementedSequence ();
|
||||
|
||||
NeedlemanWunschAligner aligner = new NeedlemanWunschAligner ();
|
||||
var aln = aligner.Align (s1rc, s2rc).First();
|
||||
VariantCallTests.ConvertAlignedSequenceToQualSeq (aln, seq2qual.Reverse ().ToArray ());
|
||||
aln.PairwiseAlignedSequences [0].Sequences [1].MarkAsReverseComplement ();
|
||||
var variants = VariantCaller.CallVariants (aln);
|
||||
Assert.AreEqual (1, variants.Count);
|
||||
var variant = variants.First ();
|
||||
Assert.AreEqual (VariantType.SNP, variant.Type);
|
||||
Assert.AreEqual (2, variant.QV);
|
||||
|
||||
var vs = variant as SNPVariant;
|
||||
Assert.AreEqual ('T', vs.AltBP);
|
||||
Assert.AreEqual ('A', vs.RefBP);
|
||||
}
|
||||
|
||||
[Test]
|
||||
public static void TestInsertionAtEndofHP() {
|
||||
string seq1seq = "ATA-CCC".Replace("-", String.Empty);
|
||||
string seq2seq = "ATACCCC";
|
||||
int[] seq2qual = new int[] { 30, 30, 30, 4, 30, 30, 30 };
|
||||
var refseq = new Sequence(AmbiguousDnaAlphabet.Instance, seq1seq, false);
|
||||
var query = new Sequence (AmbiguousDnaAlphabet.Instance, seq2seq, false);
|
||||
NeedlemanWunschAligner aligner = new NeedlemanWunschAligner ();
|
||||
var aln = aligner.Align (refseq, query).First();
|
||||
// Need to add in the QV Values.
|
||||
ConvertAlignedSequenceToQualSeq(aln, seq2qual);
|
||||
var variants = VariantCaller.CallVariants (aln);
|
||||
Assert.AreEqual (variants.Count, 1);
|
||||
var variant = variants.First ();
|
||||
Assert.AreEqual (4, variant.QV);
|
||||
Assert.AreEqual (2, variant.StartPosition);
|
||||
Assert.AreEqual (VariantType.INDEL, variant.Type);
|
||||
var vi = variant as IndelVariant;
|
||||
Assert.AreEqual ("C", vi.InsertedOrDeletedBases);
|
||||
Assert.AreEqual ('C', vi.HomopolymerBase);
|
||||
Assert.AreEqual (true, vi.AtEndOfAlignment);
|
||||
Assert.AreEqual (3, vi.HomopolymerLengthInReference);
|
||||
Assert.AreEqual (true, vi.InHomopolymer);
|
||||
Assert.AreEqual (vi.InsertionOrDeletion, IndelType.Insertion);
|
||||
}
|
||||
|
||||
|
||||
[Test]
|
||||
public static void TestDeletionAtEndofHP() {
|
||||
string seq1seq = "ATACCCC";
|
||||
string seq2seq = "ATA-CCC".Replace("-", String.Empty);
|
||||
int[] seq2qual = new int[] { 30, 30, 30, 4, 30, 30 };
|
||||
var refseq = new Sequence(AmbiguousDnaAlphabet.Instance, seq1seq, false);
|
||||
var query = new Sequence (AmbiguousDnaAlphabet.Instance, seq2seq, false);
|
||||
NeedlemanWunschAligner aligner = new NeedlemanWunschAligner ();
|
||||
var aln = aligner.Align (refseq, query).First();
|
||||
// Need to add in the QV Values.
|
||||
ConvertAlignedSequenceToQualSeq(aln, seq2qual);
|
||||
var variants = VariantCaller.CallVariants (aln);
|
||||
Assert.AreEqual (variants.Count, 1);
|
||||
var variant = variants.First ();
|
||||
Assert.AreEqual (4, variant.QV);
|
||||
Assert.AreEqual (2, variant.StartPosition);
|
||||
Assert.AreEqual (VariantType.INDEL, variant.Type);
|
||||
var vi = variant as IndelVariant;
|
||||
Assert.AreEqual ("C", vi.InsertedOrDeletedBases);
|
||||
Assert.AreEqual ('C', vi.HomopolymerBase);
|
||||
Assert.AreEqual (true, vi.AtEndOfAlignment);
|
||||
Assert.AreEqual (4, vi.HomopolymerLengthInReference);
|
||||
Assert.AreEqual (true, vi.InHomopolymer);
|
||||
Assert.AreEqual (vi.InsertionOrDeletion, IndelType.Deletion);
|
||||
}
|
||||
|
||||
[Test]
|
||||
public static void TestExceptionThrownForUnclippedAlignment() {
|
||||
var refseq = "ACAATATA";
|
||||
var queryseq = "ACAATAT-";
|
||||
|
||||
var r = new Sequence (DnaAlphabet.Instance, refseq);
|
||||
var q = new Sequence (DnaAlphabet.Instance, queryseq);
|
||||
var aln = new PairwiseSequenceAlignment (r, q);
|
||||
var pas = new PairwiseAlignedSequence ();
|
||||
pas.FirstSequence = r;
|
||||
pas.SecondSequence = q;
|
||||
aln.Add (pas);
|
||||
Assert.Throws<FormatException> (() => VariantCaller.LeftAlignIndelsAndCallVariants (aln, true));
|
||||
|
||||
refseq = "AAACAATATA";
|
||||
queryseq = "AA-CAATATA";
|
||||
|
||||
r = new Sequence (DnaAlphabet.Instance, refseq);
|
||||
q = new Sequence (DnaAlphabet.Instance, queryseq);
|
||||
aln = new PairwiseSequenceAlignment (r, q);
|
||||
pas = new PairwiseAlignedSequence ();
|
||||
pas.FirstSequence = r;
|
||||
pas.SecondSequence = q;
|
||||
aln.Add (pas);
|
||||
Assert.Throws<FormatException> (() => VariantCaller.LeftAlignIndelsAndCallVariants (aln, true));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
Загрузка…
Ссылка в новой задаче