rfc:token_as_object
Differences
This shows you the differences between two versions of the page.
Both sides previous revision · Previous revision · Next revision | Previous revision | ||
rfc:token_as_object [2020/02/13 10:22] – Remove T_BAD_CHARACTER from ignorable tokens nikic | rfc:token_as_object [2020/11/12 13:33] (current) – nikic | ||
---|---|---|---|
Line 1: | Line 1: | ||
- | ====== PHP RFC: token_get_all() | + | ====== PHP RFC: Object-based |
* Date: 2020-02-13 | * Date: 2020-02-13 | ||
* Author: Nikita Popov < | * Author: Nikita Popov < | ||
- | * Status: | + | * Status: |
* Target Version: PHP 8.0 | * Target Version: PHP 8.0 | ||
* Implementation: | * Implementation: | ||
Line 8: | Line 8: | ||
===== Introduction ===== | ===== Introduction ===== | ||
- | The '' | + | The '' |
+ | |||
+ | > **Note:** PhpToken:: | ||
===== Proposal ===== | ===== Proposal ===== | ||
- | '' | + | A new '' |
<PHP> | <PHP> | ||
Line 24: | Line 26: | ||
/** The starting position (0-based) in the tokenized string. */ | /** The starting position (0-based) in the tokenized string. */ | ||
public int $pos; | public int $pos; | ||
+ | | ||
+ | /** | ||
+ | * Same as token_get_all(), | ||
+ | * @return static[] | ||
+ | */ | ||
+ | public static function getAll(string $code, int $flags = 0): array; | ||
+ | | ||
+ | final public function __construct(int $id, string $text, int $line = -1, int $pos = -1); | ||
+ | |||
+ | /** Get the name of the token. */ | ||
+ | public function getTokenName(): | ||
+ | | ||
+ | /** | ||
+ | * Whether the token has the given ID, the given text, | ||
+ | * or has an ID/text that is contained in the given array. | ||
+ | | ||
+ | * @param int|string|array $kind | ||
+ | */ | ||
+ | public function is($kind): bool; | ||
+ | |||
+ | /** Whether this token would be ignored by the PHP parser. */ | ||
+ | public function isIgnorable(): | ||
} | } | ||
</ | </ | ||
+ | |||
+ | The '' | ||
It should be emphasized that **all** tokens are returned as objects, including single-char tokens. While this uses more memory than returning them as strings, experience has shown that the current string/ | It should be emphasized that **all** tokens are returned as objects, including single-char tokens. While this uses more memory than returning them as strings, experience has shown that the current string/ | ||
Line 35: | Line 61: | ||
* The token stores the position in the file, so that consumers don't have to compute and store it separately. | * The token stores the position in the file, so that consumers don't have to compute and store it separately. | ||
- | Finally | + | Finally, the tokens take up significantly |
< | < | ||
Line 45: | Line 71: | ||
Time: 0.32s (for 100 tokenizations) | Time: 0.32s (for 100 tokenizations) | ||
</ | </ | ||
+ | |||
+ | ==== Extensibility ==== | ||
+ | |||
+ | The '' | ||
+ | |||
+ | <PHP> | ||
+ | class MyPhpToken extends PhpToken { | ||
+ | public function getLowerText() { | ||
+ | return strtolower($this-> | ||
+ | } | ||
+ | } | ||
+ | |||
+ | $tokens = MyPhpToken:: | ||
+ | var_dump($tokens[0] instanceof MyPhpToken); | ||
+ | $tokens[0]-> | ||
+ | </ | ||
+ | |||
+ | To guarantee a well-defined construction behavior, the '' | ||
==== Additional methods ==== | ==== Additional methods ==== | ||
- | There are a few useful helper methods that could be added to the '' | + | The '' |
- | Three suggestions are given as PHP code below. The '' | + | <PHP> |
+ | public function getTokenName(): ?string { | ||
+ | if ($this-> | ||
+ | return chr($this-> | ||
+ | } elseif ('UNKNOWN' | ||
+ | return $name; | ||
+ | } else { | ||
+ | return null; | ||
+ | } | ||
+ | } | ||
+ | </ | ||
+ | |||
+ | '' | ||
+ | |||
+ | It should be noted that tokens that are not known to PHP are commonly used, for example when emulating lexer behavior from future PHP versions. In this case custom token IDs are used, so they should be handled gracefully. | ||
<PHP> | <PHP> | ||
- | class PhpToken { | + | public function is($kind): bool { |
- | /** Whether the token has the given ID, the given text, | + | if (is_array($kind)) { |
- | | + | foreach ($kind as $singleKind) { |
- | | + | if (is_string($singleKind)) { |
- | if (is_array($kind)) { | + | if ($this-> |
- | foreach ($kind as $singleKind) { | + | return true; |
- | if (is_string($singleKind)) { | + | } |
- | if ($this-> | + | } else if (is_int($singleKind)) { |
- | return true; | + | if ($this-> |
- | } | + | return true; |
- | } else if (is_int($singleKind)) { | + | |
- | if ($this-> | + | |
- | return true; | + | |
- | } | + | |
- | } else { | + | |
- | throw new TypeError(" | + | |
} | } | ||
+ | } else { | ||
+ | throw new TypeError(" | ||
} | } | ||
- | return false; | ||
- | } else if (is_string($kind)) { | ||
- | return $this-> | ||
- | } else if (is_int($kind)) { | ||
- | return $this-> | ||
- | } else { | ||
- | throw new TypeError(" | ||
} | } | ||
+ | return false; | ||
+ | } else if (is_string($kind)) { | ||
+ | return $this-> | ||
+ | } else if (is_int($kind)) { | ||
+ | return $this-> | ||
+ | } else { | ||
+ | throw new TypeError(" | ||
} | } | ||
+ | } | ||
+ | </ | ||
- | /** Whether this token would be ignored by the PHP parser. */ | + | The '' |
- | public function isIgnorable(): bool { | + | |
- | return | + | |
- | T_WHITESPACE, | + | |
- | T_COMMENT, | + | |
- | T_DOC_COMMENT, | + | |
- | T_OPEN_TAG, | + | |
- | | + | |
- | } | + | |
- | /** Get the name of the token. | + | While non-generic code can easily check the appropriate property, such as '' |
- | public function | + | |
- | if ($this->id < 256) { | + | < |
- | | + | // An example, NOT part of the PhpToken interface. |
- | } else { | + | public function |
- | return | + | |
+ | for ($count = \count($tokens); | ||
+ | | ||
+ | return $pos; | ||
} | } | ||
} | } | ||
+ | return -1; | ||
} | } | ||
</ | </ | ||
+ | |||
+ | These kinds of search/ | ||
+ | |||
+ | <PHP> | ||
+ | public function isIgnorable(): | ||
+ | return $this-> | ||
+ | T_WHITESPACE, | ||
+ | T_COMMENT, | ||
+ | T_DOC_COMMENT, | ||
+ | T_OPEN_TAG, | ||
+ | ]); | ||
+ | } | ||
+ | </ | ||
+ | |||
+ | As a special case, it is very common that whitespace and comments need to be skipped during token processing. The '' | ||
+ | |||
+ | ===== Rejected Features ===== | ||
+ | |||
+ | ==== Lazy token stream ==== | ||
+ | |||
+ | '' | ||
+ | |||
+ | This is not supported by the current proposal, because the current PHP lexer doesn' | ||
===== Backward Incompatible Changes ===== | ===== Backward Incompatible Changes ===== | ||
- | There are no backwards compatibility breaks, apart from the new constant name and the new class name. | + | There are no backwards compatibility breaks, apart from the new class name. |
===== Vote ===== | ===== Vote ===== | ||
- | Yes / No. | + | Voting opened 2020-03-06 and closed 2020-03-20. | ||
+ | |||
+ | <doodle title=" | ||
+ | | ||
+ | | ||
+ | </ | ||
rfc/token_as_object.1581589337.txt.gz · Last modified: 2020/02/13 10:22 by nikic