From 310ac15d450234b1e25fc3c676e1846dddf53df7 Mon Sep 17 00:00:00 2001
From: Gabe Stocco <98900+gfs@users.noreply.github.com>
Date: Thu, 9 Nov 2023 14:35:16 -0800
Subject: [PATCH] Fixes an issue with setting correct index values  (#567)

* Fixes an issue with setting correct index values when matching a property of an xml tag with an xpath

* Update comments

* Improve robustness of new test cases.

Also check value and index location of matches.
---
 AppInspector.RulesEngine/TextContainer.cs     |  74 +++++++++----
 .../RuleProcessor/XmlAndJsonTests.cs          | 104 +++++++++++++++++-
 2 files changed, 154 insertions(+), 24 deletions(-)
diff --git a/AppInspector.RulesEngine/TextContainer.cs b/AppInspector.RulesEngine/TextContainer.cs
index 1b631e6..a93a361 100644
--- a/AppInspector.RulesEngine/TextContainer.cs
+++ b/AppInspector.RulesEngine/TextContainer.cs
@@ -172,11 +172,13 @@ public class TextContainer
     }
 
     /// <summary>
-    ///     If this file is a JSON, XML or YML file, returns the string contents of the specified path.
+    ///     If this file is XML, attempts to return the the string contents of the specified XPath applied to the file.
     ///     If the path does not exist, or the file is not JSON, XML or YML returns null.
+    ///     Method contains some heuristic behavior and may not cover all cases. 
+    ///     Please report any issues with a sample XML and XPATH to reproduce.
     /// </summary>
-    /// <param name="Path"></param>
-    /// <returns></returns>
+    /// <param name="Path">XPath to query document with</param>
+    /// <returns>Enumeration of string and Boundary tuples for the XPath matches. Boundary locations refer to the locations in the original document on disk.</returns>
     internal IEnumerable<(string, Boundary)> GetStringFromXPath(string Path, Dictionary<string, string> xpathNameSpaces)
     {
         lock (_xpathLock)
@@ -221,26 +223,56 @@ public class TextContainer
                 continue;
             }
 
-            // First we find the name
+            // We have to heuristically calculate the original indexes of the locations in the original document because the internal representation differs
+            // For example it will convert <Tag Prop="Val"/> to <Tag Prop=\"Val\"/>
+
+            // First we find the name, absolute position index
             var nameIndex = FullContent[minIndex..].IndexOf(nodeIter.Current.Name, StringComparison.Ordinal) + minIndex;
-            // Then we grab the index of the end of this tag.
-            // We can't use OuterXML because the parser will inject the namespace if present into the OuterXML so it doesn't match the original text.
-            var endTagIndex = FullContent[nameIndex..].IndexOf('>');
-            // We also look for self-closing tag
-            var selfClosedTag = FullContent[endTagIndex-1] == '/';
-            // If the tag is self closing innerxml will be empty string, so the finding is located at the end of the tag and is empty string
-            // Otherwise the finding is the content of the xml tag
-            var offset = selfClosedTag ? endTagIndex : FullContent[nameIndex..].IndexOf(nodeIter.Current.InnerXml, StringComparison.Ordinal) + nameIndex;
-            // Move the minimum index up in case there are multiple instances of identical OuterXML
-            // This ensures we won't re-find the same one
-            var totalOffset = minIndex + nameIndex + endTagIndex;
-            minIndex = totalOffset;
-            var location = new Boundary
+            // Then we calculate the absolute index of the end of the tag.
+            // We can't use OuterXML property because the parser will inject the namespace if present into the OuterXML so it doesn't match the original text.
+            var endTagIndex = FullContent[nameIndex..].IndexOf('>', StringComparison.Ordinal) + nameIndex;
+            // If we are matching a tag itself, the previous char should be the open tag
+            //  |
+            //  v
+            // <Tag>
+            // If its a property it won't be
+            //      |
+            //      v
+            // <Tag Prop="Value">
+            var isProp = FullContent[(nameIndex - 1)] != '<';
+            // Check for self-closing tag
+            var selfClosedTag = FullContent[endTagIndex - 1] == '/';
+
+            // This is for when we're capturing the value of a property of the tag rather than the tag itself
+            if (isProp)
             {
-                Index = offset,
-                Length = nodeIter.Current.InnerXml.Length
-            };
-            yield return (nodeIter.Current.Value, location);
+                // Find the index of character after the next end tag index after the name
+                var nextClosingIndexAfterName = endTagIndex+1;
+                // If we have a self closing tag, we can use that index, otherwise we need the closure of this tag
+                var offset = selfClosedTag ? endTagIndex : FullContent[nextClosingIndexAfterName..].IndexOf('>') + nextClosingIndexAfterName + 1;
+                // Move the minimum index up to the end of the closing tag to avoid additioanl matches of the same values
+                minIndex = selfClosedTag ? offset : FullContent[offset..].IndexOf('>') + offset + 1;
+                var location = new Boundary
+                {
+                    // +2 for the \" before the value for the property
+                    Index = nameIndex + nodeIter.Current.Name.Length + 2,
+                    Length = nodeIter.Current.InnerXml.Length
+                };
+                yield return (nodeIter.Current.Value, location);
+            }
+            else
+            {
+                // Move the offset to the end of the opening tag
+                var offset = selfClosedTag ? endTagIndex : FullContent[nameIndex..].IndexOf(nodeIter.Current.InnerXml, StringComparison.Ordinal) + nameIndex;
+                // Move the minimum index up to the end of the closing tag
+                minIndex = selfClosedTag ? offset : FullContent[offset..].IndexOf('>') + offset + 1;
+                var location = new Boundary
+                {
+                    Index = offset,
+                    Length = nodeIter.Current.InnerXml.Length
+                };
+                yield return (nodeIter.Current.Value, location);
+            }
         }
     }
 
diff --git a/AppInspector.Tests/RuleProcessor/XmlAndJsonTests.cs b/AppInspector.Tests/RuleProcessor/XmlAndJsonTests.cs
index d942cf0..29725d9 100644
--- a/AppInspector.Tests/RuleProcessor/XmlAndJsonTests.cs
+++ b/AppInspector.Tests/RuleProcessor/XmlAndJsonTests.cs
@@ -83,6 +83,82 @@ public class XmlAndJsonTests
     }
 ]";
 
+    private const string xmlStringRuleForPropWithData = @"[
+    {
+        ""id"": ""SA000005"",
+        ""name"": ""Testing.Rules.XML"",
+        ""tags"": [
+            ""Testing.Rules.XML""
+        ],
+        ""severity"": ""Critical"",
+        ""description"": ""This rule checks the value of the property property to be true"",
+        ""patterns"": [
+            {
+                ""pattern"": ""true"",
+                ""type"": ""string"",
+                ""confidence"": ""High"",
+                ""scopes"": [
+                    ""code""
+                ],
+                ""xpaths"" : [""/bookstore/book/title/@*[name()='property']""]
+            }
+        ],
+        ""_comment"": """"
+    }
+]";
+
+    private const string xmlStringRuleForPropWithDataForData = @"[
+    {
+        ""id"": ""SA000005"",
+        ""name"": ""Testing.Rules.XML"",
+        ""tags"": [
+            ""Testing.Rules.XML""
+        ],
+        ""severity"": ""Critical"",
+        ""description"": ""This rule checks the value of the title tag when it has a property"",
+        ""patterns"": [
+            {
+                ""pattern"": ""Franklin"",
+                ""type"": ""regex"",
+                ""confidence"": ""High"",
+                ""scopes"": [
+                    ""code""
+                ],
+                ""xpaths"" : [""/bookstore/book/title""]
+            }
+        ],
+        ""_comment"": """"
+    }
+]";
+
+    private const string xmlDataPropsWithTagValue =
+        @"<?xml version=""1.0"" encoding=""utf-8"" ?>   
+  <bookstore>  
+      <book genre=""autobiography"" publicationdate=""1981-03-22"" ISBN=""1-861003-11-0"">  
+          <title property=""true"">The Autobiography of Benjamin Franklin</title>  
+          <author>  
+              <first-name>Benjamin</first-name>  
+              <last-name>Franklin</last-name>  
+          </author>  
+          <price>8.99</price>  
+      </book>  
+      <book genre=""novel"" publicationdate=""1967-11-17"" ISBN=""0-201-63361-2"">  
+          <title property=""false"">The Confidence Man</title>  
+          <author>  
+              <first-name>Herman</first-name>  
+              <last-name>Melville</last-name>  
+          </author>  
+          <price>11.99</price>  
+      </book>  
+      <book genre=""philosophy"" publicationdate=""1991-02-15"" ISBN=""1-861001-57-6"">  
+          <title property=""false"">The Gorgias</title>  
+          <author>  
+              <name>Plato</name>  
+          </author>  
+          <price>9.99</price>  
+      </book>  
+  </bookstore>";
+
     private const string jsonData =
         @"{
     ""books"":
@@ -228,14 +304,14 @@ public class XmlAndJsonTests
             {
                 ""xpaths"": [""system.web/trace/@enabled""],
                 ""pattern"": ""true"",
-                ""type"": ""regex""
+                ""type"": ""string""
             }
         ],
         ""must-match"": [
-            ""<system.web>\n<trace enabled='true' pageOutput='false' requestLimit='40' localOnly='false' />\n</system.web>""
+            ""<system.web>\n<trace enabled='true' pageOutput='false' requestLimit='40' localOnly='true' />\n</system.web>""
         ],
         ""must-not-match"": [
-            ""<system.web>\n<trace enabled='true' pageOutput='false' requestLimit='40' localOnly='true' />\n</system.web>""
+            ""<system.web>\n<trace enabled='false' pageOutput='false' requestLimit='40' localOnly='true' />\n</system.web>""
         ]
     }]";
         RuleSet rules = new();
@@ -268,6 +344,28 @@ public class XmlAndJsonTests
             Assert.Fail();
         }
     }
+    [DataRow(xmlStringRuleForPropWithDataForData, "Franklin", 212)]
+    [DataRow(xmlStringRuleForPropWithData, "true", 176)]
+    [DataTestMethod]
+    public void XmlTagWithPropsAndValue(string rule, string expectedValue, int expectedIndex)
+    {
+        RuleSet rules = new();
+        rules.AddString(rule, "XmlTestRules");
+        Microsoft.ApplicationInspector.RulesEngine.RuleProcessor processor = new(rules,
+            new RuleProcessorOptions { AllowAllTagsInBuildFiles = true });
+        if (_languages.FromFileNameOut("test.xml", out var info))
+        {
+            var matches = processor.AnalyzeFile(xmlDataPropsWithTagValue, new FileEntry("test.xml", new MemoryStream()), info);
+            Assert.AreEqual(1, matches.Count);
+            var match = matches[0];
+            Assert.AreEqual(expectedValue, match.Sample);
+            Assert.AreEqual(expectedIndex, match.Boundary.Index);
+        }
+        else
+        {
+            Assert.Fail();
+        }
+    }
 
     [DataRow(xmlStringRule)]
     [DataRow(jsonAndXmlStringRule)]