#88 Add API for `getDocCommentText`, `getFirst*Node`

- reuse PhpTokenizer rather than constructing a separate regex because we'll need this sort of context-specific re-tokenizing functionality for incremental parsing anyway. - functional, but still a work-in-progress, just like the rest of the API :) Also moved hand-spun lexer to experiments folder
2017-02-07 15:44:21 -08:00 · 2017-02-07 15:44:21 -08:00 · 7a58ceaeb8
--- a/experiments/Lexer.php
+++ b/experiments/Lexer.php
--- a/src/Node.php
+++ b/src/Node.php
@ -77,6 +77,42 @@ class Node implements \JsonSerializable {
        return null;
    }

+    /**
+     * Gets the first direct child node that is an instance of any of the given classes.
+     *
+     * Children are visited in the order produced by getChildNodes(); for each child the
+     * class names are tried in the order they were passed.
+     *
+     * @param string ...$classNames Class names to match against (e.g. `Foo::class`).
+     * @return Node|null The first matching child, or null if nothing matches
+     *                   (which is always the case when no class names are given).
+     */
+    public function getFirstChildNode(...$classNames) {
+        foreach ($this->getChildNodes() as $candidate) {
+            foreach ($classNames as $wantedClass) {
+                if (!($candidate instanceof $wantedClass)) {
+                    continue;
+                }
+                return $candidate;
+            }
+        }
+        return null;
+    }
+
+    /**
+     * Gets the first descendant node that is an instance of any of the given classes.
+     *
+     * Descendants are visited in the order produced by getDescendantNodes(); iteration
+     * stops as soon as a match is found.
+     *
+     * @param string ...$classNames Class names to match against (e.g. `Foo::class`).
+     * @return Node|null The first matching descendant, or null if nothing matches
+     *                   (which is always the case when no class names are given).
+     */
+    public function getFirstDescendantNode(...$classNames) {
+        foreach ($this->getDescendantNodes() as $node) {
+            foreach ($classNames as $expectedClass) {
+                if (!($node instanceof $expectedClass)) {
+                    continue;
+                }
+                return $node;
+            }
+        }
+        return null;
+    }
+
    /**
     * Gets root of the syntax tree (returns self if has no parents)
     * @return Node
@ -349,6 +385,7 @@ class Node implements \JsonSerializable {
    }

    public function & getFileContents() : string {
+        // Returns, by reference, the fileContents property of this tree's root node.
+        // TODO consider renaming to getSourceText
        return $this->getRoot()->fileContents;
    }

@ -369,6 +406,20 @@ class Node implements \JsonSerializable {
        return null;
    }

+    /**
+     * Gets the text of the last doc comment found in this node's leading trivia.
+     *
+     * The leading comment/whitespace text is re-tokenized with comments emitted as tokens
+     * (rather than skipped as trivia), using the node's full start as the initial position.
+     * The resulting tokens are then searched from the end, so when several doc comments
+     * precede the node, the one closest to it is returned.
+     *
+     * @return string|null The doc comment text, or null if no doc comment token is found.
+     */
+    public function getDocCommentText() {
+        $triviaTokens = PhpTokenizer::getTokensArrayFromContent(
+            $this->getLeadingCommentAndWhitespaceText(),
+            ParseContext::SourceElements,
+            $this->getFullStart(),
+            false
+        );
+
+        // Search back-to-front: the last doc comment before the node is the one that applies.
+        foreach (\array_reverse($triviaTokens) as $candidate) {
+            if ($candidate->kind !== TokenKind::DocCommentToken) {
+                continue;
+            }
+            return $candidate->getText($this->getFileContents());
+        }
+        return null;
+    }
+
    public function __toString() {
        return $this->getText();
    }
--- a/src/ParseContext.php
+++ b/src/ParseContext.php
@ -0,0 +1,23 @@
+<?php
+/*---------------------------------------------------------------------------------------------
+ *  Copyright (c) Microsoft Corporation. All rights reserved.
+ *  Licensed under the MIT License. See License.txt in the project root for license information.
+ *--------------------------------------------------------------------------------------------*/
+
+namespace Microsoft\PhpParser;
+
+/**
+ * Integer identifiers for parse contexts.
+ *
+ * The values are consecutive integers starting at 0, and `Count` equals the number of
+ * contexts declared before it.
+ *
+ * NOTE(review): judging by the names, each constant identifies the kind of element list
+ * being parsed (source elements, block statements, class members, ...) — confirm against
+ * the parser before relying on that.
+ */
+class ParseContext {
+    const SourceElements = 0;
+    const BlockStatements = 1;
+    const ClassMembers = 2;
+    const IfClause2Elements = 3;
+    const SwitchStatementElements = 4;
+    const CaseStatementElements = 5;
+    const WhileStatementElements = 6;
+    const ForStatementElements = 7;
+    const ForeachStatementElements = 8;
+    const DeclareStatementElements = 9;
+    const InterfaceMembers = 10;
+    const TraitMembers = 11;
+    // Number of contexts declared above (they are numbered 0 through 11).
+    const Count = 12;
+}
--- a/src/Parser.php
+++ b/src/Parser.php
@ -2927,20 +2927,4 @@ class Associativity {
    const None = 0;
    const Left = 1;
    const Right = 2;
-}
-
-class ParseContext {
-    const SourceElements = 0;
-    const BlockStatements = 1;
-    const ClassMembers = 2;
-    const IfClause2Elements = 3;
-    const SwitchStatementElements = 4;
-    const CaseStatementElements = 5;
-    const WhileStatementElements = 6;
-    const ForStatementElements = 7;
-    const ForeachStatementElements = 8;
-    const DeclareStatementElements = 9;
-    const InterfaceMembers = 10;
-    const TraitMembers = 11;
-    const Count = 12;
-}
+}
--- a/src/PhpTokenizer.php
+++ b/src/PhpTokenizer.php
@ -6,18 +6,24 @@

 namespace Microsoft\PhpParser;

+/**
+ * Tokenizes content using PHP's built-in `token_get_all`, and converts to "lightweight" Token representation.
+ *
+ * Initially we tried hand-spinning the lexer (see `experiments/Lexer.php`), but we had difficulties optimizing
+ * performance (especially when working with Unicode characters.)
+ *
+ * Class PhpTokenizer
+ * @package Microsoft\PhpParser
+ */
 class PhpTokenizer implements ITokenStreamProvider {
    public $pos;
    public $endOfFilePos;
-    private $token;
-
-    public $inScriptSection = false;

    private $tokensArray;

    public function __construct($content) {
-        $tokens = \token_get_all($content);
-        $this->initialize($tokens);
+        // getTokensArrayFromContent is a static method, so call it statically rather than
+        // through $this. `static::` resolves to the same method the `$this->` call would.
+        $this->tokensArray = static::getTokensArrayFromContent($content);
+        // The token array always ends with the EndOfFileToken appended by
+        // getTokensArrayFromContent, so the last index is the end-of-file position.
+        $this->endOfFilePos = \count($this->tokensArray) - 1;
        $this->pos = 0;
    }

@ -43,11 +49,19 @@ class PhpTokenizer implements ITokenStreamProvider {
        return $this->tokensArray;
    }

-    private function initialize($tokens) {
+    public static function getTokensArrayFromContent(
+        $content, $parseContext = null, $initialPos = 0, $treatCommentsAsTrivia = true
+    ) : array {
+        if ($parseContext !== null) {
+            $prefix = self::PARSE_CONTEXT_TO_PREFIX[$parseContext];
+            $content = $prefix . $content;
+            $passedPrefix = false;
+        }
+
+        $tokens = \token_get_all($content);
+
        $arr = array();
-        $fullStart = 0;
-        $start = 0;
-        $pos = 0;
+        $fullStart = $start = $pos = $initialPos;

        foreach ($tokens as $token) {
            if (\is_array($token)) {
@ -60,6 +74,14 @@ class PhpTokenizer implements ITokenStreamProvider {

            $pos += $strlen;

+            if ($parseContext !== null && !$passedPrefix) {
+                // Skip the tokens that belong to the artificial prefix prepended for this
+                // parse context. $prefix is a string, so measure it with strlen(): count()
+                // on a string is not a length (it warns on PHP 7.2+ and throws on PHP 8).
+                // $pos started at $initialPos, so compare against the bytes consumed so far.
+                $passedPrefix = \strlen($prefix) <= $pos - $initialPos;
+                if ($passedPrefix) {
+                    // Restart position tracking at the caller-supplied offset so that the
+                    // tokens of the real content are positioned as if the prefix never existed.
+                    $fullStart = $start = $pos = $initialPos;
+                }
+                continue;
+            }
+
            switch ($tokenKind) {
                case T_OPEN_TAG:
                    $arr[] = new Token(TokenKind::ScriptSectionStartTag, $fullStart, $start, $pos-$fullStart);
@ -67,8 +89,6 @@ class PhpTokenizer implements ITokenStreamProvider {
                    continue;

                case T_WHITESPACE:
-                case T_COMMENT:
-                case T_DOC_COMMENT:
                    $start += $strlen;
                    continue;

@ -82,6 +102,11 @@ class PhpTokenizer implements ITokenStreamProvider {
                    }

                default:
+                    if (($tokenKind === T_COMMENT || $tokenKind === T_DOC_COMMENT) && $treatCommentsAsTrivia) {
+                        $start += $strlen;
+                        continue;
+                    }
+
                    $newTokenKind = isset(self::TOKEN_MAP[$tokenKind])
                        ? self::TOKEN_MAP[$tokenKind]
                        : $newTokenKind = TokenKind::Unknown;
@ -92,8 +117,7 @@ class PhpTokenizer implements ITokenStreamProvider {
        }

        $arr[] = new Token(TokenKind::EndOfFileToken, $fullStart, $start, $pos - $fullStart);
-        $this->tokensArray = $arr;
-        $this->endOfFilePos = \count($arr) - 1;
+        return $arr;
    }

    const TOKEN_MAP = [
@ -267,6 +291,12 @@ class PhpTokenizer implements ITokenStreamProvider {
        T_UNSET_CAST        => TokenKind::UnsetCastToken,
        T_START_HEREDOC     => TokenKind::HeredocStart,
        T_END_HEREDOC       => TokenKind::HeredocEnd,
-        T_STRING_VARNAME    => TokenKind::VariableName
+        T_STRING_VARNAME    => TokenKind::VariableName,
+        T_COMMENT           => TokenKind::CommentToken,
+        T_DOC_COMMENT       => TokenKind::DocCommentToken
+    ];
+
+    // Text prepended to a content fragment, keyed by ParseContext, before the fragment is
+    // passed to token_get_all in getTokensArrayFromContent; the leading tokens that come
+    // from this prefix are skipped there and do not appear in the returned token array.
+    const PARSE_CONTEXT_TO_PREFIX = [
+        ParseContext::SourceElements => "<?php "
+    ];
 }
--- a/src/TokenKind.php
+++ b/src/TokenKind.php
@ -212,6 +212,8 @@ class TokenKind {
    const BoolCastToken = 414;
    const ArrayCastToken = 415;
    const IntegerLiteralToken = 416;
+    const CommentToken = 417;
+    const DocCommentToken = 418;

    // TODO type annotations - PHP7
 }
--- a/tests/NodeApiTest.php
+++ b/tests/NodeApiTest.php
@ -4,18 +4,12 @@
 *  Licensed under the MIT License. See License.txt in the project root for license information.
 *--------------------------------------------------------------------------------------------*/

-// TODO autoload classes
-require_once(__DIR__ . "/../src/TokenStreamProviderFactory.php");
-require_once(__DIR__ . "/../src/Parser.php");
-require_once(__DIR__ . "/../src/Token.php");
-
-use Microsoft\PhpParser\Node;
 use Microsoft\PhpParser\Node\SourceFileNode;
+use Microsoft\PhpParser\Node\Statement\FunctionDeclaration;
 use Microsoft\PhpParser\Node\Statement\IfStatementNode;
 use Microsoft\PhpParser\Node\Statement\NamespaceDefinition;
 use Microsoft\PhpParser\Parser;
 use PHPUnit\Framework\TestCase;
-use Microsoft\PhpParser\TokenKind;

 class NodeApiTest extends TestCase {
    const FILENAME_PATTERN = __dir__ . "/cases/{parser,}/*.php";
@ -38,7 +32,7 @@ PHP;

    public function testSourceFileNodePosition() {
        $node = self::$sourceFileNode;
-        $this->assertInstanceOf(\Microsoft\PhpParser\Node\Statement\FunctionDeclaration::class, $node->getDescendantNodeAtPosition(15));
+        $this->assertInstanceOf(FunctionDeclaration::class, $node->getDescendantNodeAtPosition(15));
        $this->assertInstanceOf(\Microsoft\PhpParser\Node\Expression\Variable::class, $node->getDescendantNodeAtPosition(28));
    }

@ -135,4 +129,65 @@ PHP;
            "getFirstAncestor with no specified class names should return null."
        );
    }
+
+    /**
+     * Checks getDocCommentText() against a table of source snippets.
+     */
+    public function testGetDocCommentText() {
+        // Each case: [node class to locate, source text, expected doc comment text (null = none)].
+        $cases = array(
+            array(FunctionDeclaration::class, "<?php /** */ function b () { }", "/** */"),
+            array(FunctionDeclaration::class, "<?php /***/ function b () { }", null),
+            array(FunctionDeclaration::class, "<?php /*/** */ function b () { }", null),
+            array(FunctionDeclaration::class, "<?php /**d */ function b () { }", null),
+            array(FunctionDeclaration::class, "<?php /** hello */\n/** */ function b () { }", "/** */"),
+            array(FunctionDeclaration::class, "<?php /** hello */\n/**\n*/ function b () { }", "/**\n*/"),
+            array(FunctionDeclaration::class, "<?php function b () { }", null),
+            array(
+                \Microsoft\PhpParser\Node\Statement\InlineHtml::class,
+                "/** hello */ <?php function b () { }",
+                null
+            ),
+        );
+
+        foreach ($cases as $case) {
+            list($nodeKind, $contents, $expectedDocCommentText) = $case;
+            $this->AssertDocCommentTextOfNode($nodeKind, $contents, $expectedDocCommentText);
+        }
+    }
+
+    /**
+     * Parses $contents, locates the first descendant node of class $nodeKind, and asserts
+     * that its doc comment text is exactly $expectedDocCommentText.
+     *
+     * @param string $nodeKind Class name of the node to locate (e.g. `FunctionDeclaration::class`).
+     * @param string $contents Source text to parse.
+     * @param string|null $expectedDocCommentText Expected doc comment text, or null for none.
+     * @return array The tuple [$contents, $parser, $ast, $node].
+     */
+    private function AssertDocCommentTextOfNode($nodeKind, $contents, $expectedDocCommentText) : array {
+        $parser = new Parser();
+        $ast = $parser->parseSourceFile($contents);
+        $node = $ast->getFirstDescendantNode($nodeKind);
+        // Fail with a readable assertion (instead of a fatal call on null) when the source
+        // does not contain a node of the requested kind.
+        $this->assertNotNull($node, "Expected to find a node of kind $nodeKind in: $contents");
+        // assertSame rather than assertEquals: with loose comparison an empty string would
+        // be accepted where null ("no doc comment") is expected.
+        $this->assertSame(
+            $expectedDocCommentText,
+            $node->getDocCommentText()
+        );
+        return array($contents, $parser, $ast, $node);
+    }
 }