  1. """Implements a Jinja / Python combination lexer. The ``Lexer`` class
  2. is used to do some preprocessing. It filters out invalid operators like
  3. the bitshift operators we don't allow in templates. It separates
  4. template code and python code in expressions.
  5. """
  6. import re
  7. import typing as t
  8. from ast import literal_eval
  9. from collections import deque
  10. from sys import intern
  11. from ._identifier import pattern as name_re
  12. from .exceptions import TemplateSyntaxError
  13. from .utils import LRUCache
  14. if t.TYPE_CHECKING:
  15. import typing_extensions as te
  16. from .environment import Environment
  17. # cache for the lexers. Exists in order to be able to have multiple
  18. # environments with the same lexer
  19. _lexer_cache: t.MutableMapping[t.Tuple, "Lexer"] = LRUCache(50) # type: ignore
  20. # static regular expressions
  21. whitespace_re = re.compile(r"\s+")
  22. newline_re = re.compile(r"(\r\n|\r|\n)")
  23. string_re = re.compile(
  24. r"('([^'\\]*(?:\\.[^'\\]*)*)'" r'|"([^"\\]*(?:\\.[^"\\]*)*)")', re.S
  25. )
  26. integer_re = re.compile(
  27. r"""
  28. (
  29. 0b(_?[0-1])+ # binary
  30. |
  31. 0o(_?[0-7])+ # octal
  32. |
  33. 0x(_?[\da-f])+ # hex
  34. |
  35. [1-9](_?\d)* # decimal
  36. |
  37. 0(_?0)* # decimal zero
  38. )
  39. """,
  40. re.IGNORECASE | re.VERBOSE,
  41. )
  42. float_re = re.compile(
  43. r"""
  44. (?<!\.) # doesn't start with a .
  45. (\d+_)*\d+ # digits, possibly _ separated
  46. (
  47. (\.(\d+_)*\d+)? # optional fractional part
  48. e[+\-]?(\d+_)*\d+ # exponent part
  49. |
  50. \.(\d+_)*\d+ # required fractional part
  51. )
  52. """,
  53. re.IGNORECASE | re.VERBOSE,
  54. )
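# A few examples of what the number patterns accept (underscore separators
# mirror Python literals), shown here only as documentation:
#
#     integer_re.fullmatch("0b1010")     # binary literal
#     integer_re.fullmatch("1_000_000")  # decimal with separators
#     float_re.search("42.5e-3")         # fraction plus exponent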
# intern the tokens and keep references to them
TOKEN_ADD = intern("add")
TOKEN_ASSIGN = intern("assign")
TOKEN_COLON = intern("colon")
TOKEN_COMMA = intern("comma")
TOKEN_DIV = intern("div")
TOKEN_DOT = intern("dot")
TOKEN_EQ = intern("eq")
TOKEN_FLOORDIV = intern("floordiv")
TOKEN_GT = intern("gt")
TOKEN_GTEQ = intern("gteq")
TOKEN_LBRACE = intern("lbrace")
TOKEN_LBRACKET = intern("lbracket")
TOKEN_LPAREN = intern("lparen")
TOKEN_LT = intern("lt")
TOKEN_LTEQ = intern("lteq")
TOKEN_MOD = intern("mod")
TOKEN_MUL = intern("mul")
TOKEN_NE = intern("ne")
TOKEN_PIPE = intern("pipe")
TOKEN_POW = intern("pow")
TOKEN_RBRACE = intern("rbrace")
TOKEN_RBRACKET = intern("rbracket")
TOKEN_RPAREN = intern("rparen")
TOKEN_SEMICOLON = intern("semicolon")
TOKEN_SUB = intern("sub")
TOKEN_TILDE = intern("tilde")
TOKEN_WHITESPACE = intern("whitespace")
TOKEN_FLOAT = intern("float")
TOKEN_INTEGER = intern("integer")
TOKEN_NAME = intern("name")
TOKEN_STRING = intern("string")
TOKEN_OPERATOR = intern("operator")
TOKEN_BLOCK_BEGIN = intern("block_begin")
TOKEN_BLOCK_END = intern("block_end")
TOKEN_VARIABLE_BEGIN = intern("variable_begin")
TOKEN_VARIABLE_END = intern("variable_end")
TOKEN_RAW_BEGIN = intern("raw_begin")
TOKEN_RAW_END = intern("raw_end")
TOKEN_COMMENT_BEGIN = intern("comment_begin")
TOKEN_COMMENT_END = intern("comment_end")
TOKEN_COMMENT = intern("comment")
TOKEN_LINESTATEMENT_BEGIN = intern("linestatement_begin")
TOKEN_LINESTATEMENT_END = intern("linestatement_end")
TOKEN_LINECOMMENT_BEGIN = intern("linecomment_begin")
TOKEN_LINECOMMENT_END = intern("linecomment_end")
TOKEN_LINECOMMENT = intern("linecomment")
TOKEN_DATA = intern("data")
TOKEN_INITIAL = intern("initial")
TOKEN_EOF = intern("eof")

# bind operators to token types
operators = {
    "+": TOKEN_ADD,
    "-": TOKEN_SUB,
    "/": TOKEN_DIV,
    "//": TOKEN_FLOORDIV,
    "*": TOKEN_MUL,
    "%": TOKEN_MOD,
    "**": TOKEN_POW,
    "~": TOKEN_TILDE,
    "[": TOKEN_LBRACKET,
    "]": TOKEN_RBRACKET,
    "(": TOKEN_LPAREN,
    ")": TOKEN_RPAREN,
    "{": TOKEN_LBRACE,
    "}": TOKEN_RBRACE,
    "==": TOKEN_EQ,
    "!=": TOKEN_NE,
    ">": TOKEN_GT,
    ">=": TOKEN_GTEQ,
    "<": TOKEN_LT,
    "<=": TOKEN_LTEQ,
    "=": TOKEN_ASSIGN,
    ".": TOKEN_DOT,
    ":": TOKEN_COLON,
    "|": TOKEN_PIPE,
    ",": TOKEN_COMMA,
    ";": TOKEN_SEMICOLON,
}

reverse_operators = {v: k for k, v in operators.items()}
assert len(operators) == len(reverse_operators), "operators dropped"
operator_re = re.compile(
    f"({'|'.join(re.escape(x) for x in sorted(operators, key=lambda x: -len(x)))})"
)
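# Because the alternation above is sorted longest-first, multi-character
# operators win over their prefixes, for example:
#
#     operator_re.match("**").group()             # "**", not "*"
#     operators[operator_re.match("//").group()]  # TOKEN_FLOORDIV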
ignored_tokens = frozenset(
    [
        TOKEN_COMMENT_BEGIN,
        TOKEN_COMMENT,
        TOKEN_COMMENT_END,
        TOKEN_WHITESPACE,
        TOKEN_LINECOMMENT_BEGIN,
        TOKEN_LINECOMMENT_END,
        TOKEN_LINECOMMENT,
    ]
)
ignore_if_empty = frozenset(
    [TOKEN_WHITESPACE, TOKEN_DATA, TOKEN_COMMENT, TOKEN_LINECOMMENT]
)


def _describe_token_type(token_type: str) -> str:
    if token_type in reverse_operators:
        return reverse_operators[token_type]

    return {
        TOKEN_COMMENT_BEGIN: "begin of comment",
        TOKEN_COMMENT_END: "end of comment",
        TOKEN_COMMENT: "comment",
        TOKEN_LINECOMMENT: "comment",
        TOKEN_BLOCK_BEGIN: "begin of statement block",
        TOKEN_BLOCK_END: "end of statement block",
        TOKEN_VARIABLE_BEGIN: "begin of print statement",
        TOKEN_VARIABLE_END: "end of print statement",
        TOKEN_LINESTATEMENT_BEGIN: "begin of line statement",
        TOKEN_LINESTATEMENT_END: "end of line statement",
        TOKEN_DATA: "template data / text",
        TOKEN_EOF: "end of template",
    }.get(token_type, token_type)


def describe_token(token: "Token") -> str:
    """Returns a description of the token."""
    if token.type == TOKEN_NAME:
        return token.value

    return _describe_token_type(token.type)


def describe_token_expr(expr: str) -> str:
    """Like `describe_token` but for token expressions."""
    if ":" in expr:
        type, value = expr.split(":", 1)

        if type == TOKEN_NAME:
            return value
    else:
        type = expr

    return _describe_token_type(type)
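# Token expressions are either a bare type or "type:value", for example:
#
#     describe_token_expr("block_end")    # "end of statement block"
#     describe_token_expr("name:endfor")  # "endfor"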
def count_newlines(value: str) -> int:
    """Count the number of newline characters in the string. This is
    useful for extensions that filter a stream.
    """
    return len(newline_re.findall(value))


def compile_rules(environment: "Environment") -> t.List[t.Tuple[str, str]]:
    """Compiles all the rules from the environment into a list of rules."""
    e = re.escape
    rules = [
        (
            len(environment.comment_start_string),
            TOKEN_COMMENT_BEGIN,
            e(environment.comment_start_string),
        ),
        (
            len(environment.block_start_string),
            TOKEN_BLOCK_BEGIN,
            e(environment.block_start_string),
        ),
        (
            len(environment.variable_start_string),
            TOKEN_VARIABLE_BEGIN,
            e(environment.variable_start_string),
        ),
    ]

    if environment.line_statement_prefix is not None:
        rules.append(
            (
                len(environment.line_statement_prefix),
                TOKEN_LINESTATEMENT_BEGIN,
                r"^[ \t\v]*" + e(environment.line_statement_prefix),
            )
        )
    if environment.line_comment_prefix is not None:
        rules.append(
            (
                len(environment.line_comment_prefix),
                TOKEN_LINECOMMENT_BEGIN,
                r"(?:^|(?<=\S))[^\S\r\n]*" + e(environment.line_comment_prefix),
            )
        )

    return [x[1:] for x in sorted(rules, reverse=True)]
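# With the default delimiters ("{#", "{%", "{{") every start string has
# length 2, so the reverse sort falls back to the token name and the pairs
# should come out as variable_begin, comment_begin, block_begin, each paired
# with its re.escape()d start string.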
class Failure:
    """Class that raises a `TemplateSyntaxError` if called.
    Used by the `Lexer` to specify known errors.
    """

    def __init__(
        self, message: str, cls: t.Type[TemplateSyntaxError] = TemplateSyntaxError
    ) -> None:
        self.message = message
        self.error_class = cls

    def __call__(self, lineno: int, filename: str) -> "te.NoReturn":
        raise self.error_class(self.message, lineno, filename)


class Token(t.NamedTuple):
    lineno: int
    type: str
    value: str

    def __str__(self) -> str:
        return describe_token(self)

    def test(self, expr: str) -> bool:
        """Test a token against a token expression. This can either be a
        token type or ``'token_type:token_value'``. This can only test
        against string values and types.
        """
        # here we do a regular string equality check as test_any is usually
        # passed an iterable of not interned strings.
        if self.type == expr:
            return True

        if ":" in expr:
            return expr.split(":", 1) == [self.type, self.value]

        return False

    def test_any(self, *iterable: str) -> bool:
        """Test against multiple token expressions."""
        return any(self.test(expr) for expr in iterable)
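# Token.test accepts either a bare type or a "type:value" expression, e.g.
#
#     tok = Token(1, TOKEN_NAME, "endfor")
#     tok.test("name")                        # True
#     tok.test("name:endfor")                 # True
#     tok.test_any("integer", "name:endfor")  # True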
class TokenStreamIterator:
    """The iterator for tokenstreams. Iterate over the stream
    until the eof token is reached.
    """

    def __init__(self, stream: "TokenStream") -> None:
        self.stream = stream

    def __iter__(self) -> "TokenStreamIterator":
        return self

    def __next__(self) -> Token:
        token = self.stream.current

        if token.type is TOKEN_EOF:
            self.stream.close()
            raise StopIteration

        next(self.stream)
        return token


class TokenStream:
    """A token stream is an iterable that yields :class:`Token`\\s. The
    parser however does not iterate over it but calls :meth:`next` to go
    one token ahead. The current active token is stored as :attr:`current`.
    """

    def __init__(
        self,
        generator: t.Iterable[Token],
        name: t.Optional[str],
        filename: t.Optional[str],
    ):
        self._iter = iter(generator)
        self._pushed: "te.Deque[Token]" = deque()
        self.name = name
        self.filename = filename
        self.closed = False
        self.current = Token(1, TOKEN_INITIAL, "")
        next(self)

    def __iter__(self) -> TokenStreamIterator:
        return TokenStreamIterator(self)

    def __bool__(self) -> bool:
        return bool(self._pushed) or self.current.type is not TOKEN_EOF

    @property
    def eos(self) -> bool:
        """Are we at the end of the stream?"""
        return not self

    def push(self, token: Token) -> None:
        """Push a token back to the stream."""
        self._pushed.append(token)

    def look(self) -> Token:
        """Look at the next token."""
        old_token = next(self)
        result = self.current
        self.push(result)
        self.current = old_token
        return result

    def skip(self, n: int = 1) -> None:
  310. """Got n tokens ahead."""
        for _ in range(n):
            next(self)

    def next_if(self, expr: str) -> t.Optional[Token]:
        """Perform the token test and return the token if it matched.
        Otherwise the return value is `None`.
        """
        if self.current.test(expr):
            return next(self)

        return None

    def skip_if(self, expr: str) -> bool:
        """Like :meth:`next_if` but only returns `True` or `False`."""
        return self.next_if(expr) is not None

    def __next__(self) -> Token:
        """Go one token ahead and return the old one.

        Use the built-in :func:`next` instead of calling this directly.
        """
        rv = self.current

        if self._pushed:
            self.current = self._pushed.popleft()
        elif self.current.type is not TOKEN_EOF:
            try:
                self.current = next(self._iter)
            except StopIteration:
                self.close()

        return rv

    def close(self) -> None:
        """Close the stream."""
        self.current = Token(self.current.lineno, TOKEN_EOF, "")
        self._iter = iter(())
        self.closed = True

    def expect(self, expr: str) -> Token:
        """Expect a given token type and return it. This accepts the same
        argument as :meth:`jinja2.lexer.Token.test`.
        """
        if not self.current.test(expr):
            expr = describe_token_expr(expr)

            if self.current.type is TOKEN_EOF:
                raise TemplateSyntaxError(
                    f"unexpected end of template, expected {expr!r}.",
                    self.current.lineno,
                    self.name,
                    self.filename,
                )

            raise TemplateSyntaxError(
                f"expected token {expr!r}, got {describe_token(self.current)!r}",
                self.current.lineno,
                self.name,
                self.filename,
            )

        return next(self)
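# Sketch of how a parser typically drives the stream: it inspects ``current``
# and advances with ``skip_if``/``expect`` instead of iterating, e.g.
#
#     stream = TokenStream(iter([Token(1, "name", "x")]), None, None)
#     stream.current.type          # "name"
#     stream.skip_if("name:y")     # False, the token stays current
#     stream.expect("name").value  # "x"; afterwards stream.eos is True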
def get_lexer(environment: "Environment") -> "Lexer":
    """Return a lexer which is probably cached."""
    key = (
        environment.block_start_string,
        environment.block_end_string,
        environment.variable_start_string,
        environment.variable_end_string,
        environment.comment_start_string,
        environment.comment_end_string,
        environment.line_statement_prefix,
        environment.line_comment_prefix,
        environment.trim_blocks,
        environment.lstrip_blocks,
        environment.newline_sequence,
        environment.keep_trailing_newline,
    )
    lexer = _lexer_cache.get(key)

    if lexer is None:
        _lexer_cache[key] = lexer = Lexer(environment)

    return lexer
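# Environments that share all of the settings above also share a single
# Lexer instance, so for two identically configured (hypothetical)
# environments env_a and env_b:
#
#     get_lexer(env_a) is get_lexer(env_b)  # True while the key stays in
#                                           # the 50-slot LRU cache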
class OptionalLStrip(tuple):
    """A special tuple for marking a point in the state that can have
    lstrip applied.
    """

    __slots__ = ()

    # Even though it looks like a no-op, creating instances fails
    # without this.
    def __new__(cls, *members, **kwargs):  # type: ignore
        return super().__new__(cls, members)


class _Rule(t.NamedTuple):
    pattern: t.Pattern[str]
    tokens: t.Union[str, t.Tuple[str, ...], t.Tuple[Failure]]
    command: t.Optional[str]


class Lexer:
    """Class that implements a lexer for a given environment. Automatically
    created by the environment class, usually you don't have to do that.

    Note that the lexer is not automatically bound to an environment.
    Multiple environments can share the same lexer.
    """

    def __init__(self, environment: "Environment") -> None:
        # shortcuts
        e = re.escape

        def c(x: str) -> t.Pattern[str]:
            return re.compile(x, re.M | re.S)

        # lexing rules for tags
        tag_rules: t.List[_Rule] = [
            _Rule(whitespace_re, TOKEN_WHITESPACE, None),
            _Rule(float_re, TOKEN_FLOAT, None),
            _Rule(integer_re, TOKEN_INTEGER, None),
            _Rule(name_re, TOKEN_NAME, None),
            _Rule(string_re, TOKEN_STRING, None),
            _Rule(operator_re, TOKEN_OPERATOR, None),
        ]

        # assemble the root lexing rule. because "|" is ungreedy
        # we have to sort by length so that the lexer continues working
        # as expected when we have parsing rules like <% for block and
        # <%= for variables. (if someone wants asp like syntax)
        # variables are just part of the rules if variable processing
        # is required.
        root_tag_rules = compile_rules(environment)

        block_start_re = e(environment.block_start_string)
        block_end_re = e(environment.block_end_string)
        comment_end_re = e(environment.comment_end_string)
        variable_end_re = e(environment.variable_end_string)

        # block suffix if trimming is enabled
        block_suffix_re = "\\n?" if environment.trim_blocks else ""

        # If lstrip is enabled, it should not be applied if there is any
        # non-whitespace between the newline and block.
        self.lstrip_unless_re = c(r"[^ \t]") if environment.lstrip_blocks else None

        self.newline_sequence = environment.newline_sequence
        self.keep_trailing_newline = environment.keep_trailing_newline

        root_raw_re = (
            fr"(?P<raw_begin>{block_start_re}(\-|\+|)\s*raw\s*"
            fr"(?:\-{block_end_re}\s*|{block_end_re}))"
        )
        root_parts_re = "|".join(
            [root_raw_re] + [fr"(?P<{n}>{r}(\-|\+|))" for n, r in root_tag_rules]
        )

        # global lexing rules
        self.rules: t.Dict[str, t.List[_Rule]] = {
            "root": [
                # directives
                _Rule(
                    c(fr"(.*?)(?:{root_parts_re})"),
                    OptionalLStrip(TOKEN_DATA, "#bygroup"),  # type: ignore
                    "#bygroup",
                ),
                # data
                _Rule(c(".+"), TOKEN_DATA, None),
            ],
            # comments
            TOKEN_COMMENT_BEGIN: [
                _Rule(
                    c(
                        fr"(.*?)((?:\+{comment_end_re}|\-{comment_end_re}\s*"
                        fr"|{comment_end_re}{block_suffix_re}))"
                    ),
                    (TOKEN_COMMENT, TOKEN_COMMENT_END),
                    "#pop",
                ),
                _Rule(c(r"(.)"), (Failure("Missing end of comment tag"),), None),
            ],
            # blocks
            TOKEN_BLOCK_BEGIN: [
                _Rule(
                    c(
                        fr"(?:\+{block_end_re}|\-{block_end_re}\s*"
                        fr"|{block_end_re}{block_suffix_re})"
                    ),
                    TOKEN_BLOCK_END,
                    "#pop",
                ),
            ]
            + tag_rules,
            # variables
            TOKEN_VARIABLE_BEGIN: [
                _Rule(
                    c(fr"\-{variable_end_re}\s*|{variable_end_re}"),
                    TOKEN_VARIABLE_END,
                    "#pop",
                )
            ]
            + tag_rules,
            # raw block
            TOKEN_RAW_BEGIN: [
                _Rule(
                    c(
                        fr"(.*?)((?:{block_start_re}(\-|\+|))\s*endraw\s*"
                        fr"(?:\+{block_end_re}|\-{block_end_re}\s*"
                        fr"|{block_end_re}{block_suffix_re}))"
                    ),
                    OptionalLStrip(TOKEN_DATA, TOKEN_RAW_END),  # type: ignore
                    "#pop",
                ),
                _Rule(c(r"(.)"), (Failure("Missing end of raw directive"),), None),
            ],
            # line statements
            TOKEN_LINESTATEMENT_BEGIN: [
                _Rule(c(r"\s*(\n|$)"), TOKEN_LINESTATEMENT_END, "#pop")
            ]
            + tag_rules,
            # line comments
            TOKEN_LINECOMMENT_BEGIN: [
                _Rule(
                    c(r"(.*?)()(?=\n|$)"),
                    (TOKEN_LINECOMMENT, TOKEN_LINECOMMENT_END),
                    "#pop",
                )
            ],
        }
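    # Rough walk-through of the state machine above: lexing "a {{ b }}"
    # starts in "root", where the first rule captures "a " as data and the
    # named group "variable_begin" is yielded via "#bygroup", which also
    # pushes the "variable_begin" state; there the tag_rules emit whitespace
    # and name tokens until the variable_end rule matches "}}" and "#pop"
    # returns to "root".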
    def _normalize_newlines(self, value: str) -> str:
        """Replace all newlines with the configured sequence in strings
        and template data.
        """
        return newline_re.sub(self.newline_sequence, value)

    def tokenize(
        self,
        source: str,
        name: t.Optional[str] = None,
        filename: t.Optional[str] = None,
        state: t.Optional[str] = None,
    ) -> TokenStream:
  523. """Calls tokeniter + tokenize and wraps it in a token stream."""
        stream = self.tokeniter(source, name, filename, state)
        return TokenStream(self.wrap(stream, name, filename), name, filename)
    def wrap(
        self,
        stream: t.Iterable[t.Tuple[int, str, str]],
        name: t.Optional[str] = None,
        filename: t.Optional[str] = None,
    ) -> t.Iterator[Token]:
  532. """This is called with the stream as returned by `tokenize` and wraps
  533. every token in a :class:`Token` and converts the value.
  534. """
        for lineno, token, value_str in stream:
            if token in ignored_tokens:
                continue

            value: t.Any = value_str

            if token == TOKEN_LINESTATEMENT_BEGIN:
                token = TOKEN_BLOCK_BEGIN
            elif token == TOKEN_LINESTATEMENT_END:
                token = TOKEN_BLOCK_END
            # we are not interested in those tokens in the parser
            elif token in (TOKEN_RAW_BEGIN, TOKEN_RAW_END):
                continue
            elif token == TOKEN_DATA:
                value = self._normalize_newlines(value_str)
            elif token == "keyword":
                token = value_str
            elif token == TOKEN_NAME:
                value = value_str

                if not value.isidentifier():
                    raise TemplateSyntaxError(
                        "Invalid character in identifier", lineno, name, filename
                    )
            elif token == TOKEN_STRING:
                # try to unescape string
                try:
                    value = (
                        self._normalize_newlines(value_str[1:-1])
                        .encode("ascii", "backslashreplace")
                        .decode("unicode-escape")
                    )
                except Exception as e:
                    msg = str(e).split(":")[-1].strip()
                    raise TemplateSyntaxError(msg, lineno, name, filename) from e
            elif token == TOKEN_INTEGER:
                value = int(value_str.replace("_", ""), 0)
            elif token == TOKEN_FLOAT:
                # remove all "_" first to support more Python versions
                value = literal_eval(value_str.replace("_", ""))
            elif token == TOKEN_OPERATOR:
                token = operators[value_str]

            yield Token(lineno, token, value)
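    # Examples of the conversion performed above: a raw triple such as
    # (1, "integer", "1_000") becomes Token(1, "integer", 1000), and a
    # (1, "string", "'a\\nb'") triple has its quotes stripped and the escape
    # sequence decoded before the Token is yielded.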
    def tokeniter(
        self,
        source: str,
        name: t.Optional[str],
        filename: t.Optional[str] = None,
        state: t.Optional[str] = None,
    ) -> t.Iterator[t.Tuple[int, str, str]]:
        """This method tokenizes the text and returns the tokens in a
        generator. Use this method if you just want to tokenize a template.

        .. versionchanged:: 3.0
            Only ``\\n``, ``\\r\\n`` and ``\\r`` are treated as line
            breaks.
        """
        lines = newline_re.split(source)[::2]

        if not self.keep_trailing_newline and lines[-1] == "":
            del lines[-1]

        source = "\n".join(lines)
        pos = 0
        lineno = 1
        stack = ["root"]

        if state is not None and state != "root":
            assert state in ("variable", "block"), "invalid state"
            stack.append(state + "_begin")

        statetokens = self.rules[stack[-1]]
        source_length = len(source)
        balancing_stack: t.List[str] = []
        lstrip_unless_re = self.lstrip_unless_re
        newlines_stripped = 0
        line_starting = True

        while True:
            # tokenizer loop
            for regex, tokens, new_state in statetokens:
                m = regex.match(source, pos)

                # if no match we try again with the next rule
                if m is None:
                    continue

                # we only match blocks and variables if braces / parentheses
                # are balanced. continue parsing with the lower rule which
                # is the operator rule. do this only if the end tags look
                # like operators
                if balancing_stack and tokens in (
                    TOKEN_VARIABLE_END,
                    TOKEN_BLOCK_END,
                    TOKEN_LINESTATEMENT_END,
                ):
                    continue

                # tuples support more options
                if isinstance(tokens, tuple):
                    groups = m.groups()

                    if isinstance(tokens, OptionalLStrip):
                        # Rule supports lstrip. Match will look like
                        # text, block type, whitespace control, type, control, ...
                        text = groups[0]
                        # Skipping the text and first type, every other group is the
                        # whitespace control for each type. One of the groups will be
                        # -, +, or empty string instead of None.
                        strip_sign = next(g for g in groups[2::2] if g is not None)

                        if strip_sign == "-":
                            # Strip all whitespace between the text and the tag.
                            stripped = text.rstrip()
                            newlines_stripped = text[len(stripped) :].count("\n")
                            groups = [stripped, *groups[1:]]
                        elif (
                            # Not marked for preserving whitespace.
                            strip_sign != "+"
                            # lstrip is enabled.
                            and lstrip_unless_re is not None
                            # Not a variable expression.
                            and not m.groupdict().get(TOKEN_VARIABLE_BEGIN)
                        ):
                            # The start of text between the last newline and the tag.
                            l_pos = text.rfind("\n") + 1

                            if l_pos > 0 or line_starting:
                                # If there's only whitespace between the newline and the
                                # tag, strip it.
                                if not lstrip_unless_re.search(text, l_pos):
                                    groups = [text[:l_pos], *groups[1:]]

                    for idx, token in enumerate(tokens):
                        # failure group
                        if token.__class__ is Failure:
                            raise token(lineno, filename)
                        # bygroup is a bit more complex, in that case we
                        # yield for the current token the first named
                        # group that matched
                        elif token == "#bygroup":
                            for key, value in m.groupdict().items():
                                if value is not None:
                                    yield lineno, key, value
                                    lineno += value.count("\n")
                                    break
                            else:
                                raise RuntimeError(
                                    f"{regex!r} wanted to resolve the token dynamically"
                                    " but no group matched"
                                )
                        # normal group
                        else:
                            data = groups[idx]

                            if data or token not in ignore_if_empty:
                                yield lineno, token, data

                            lineno += data.count("\n") + newlines_stripped
                            newlines_stripped = 0
                # strings as token are just yielded as-is.
                else:
                    data = m.group()

                    # update brace/parentheses balance
                    if tokens == TOKEN_OPERATOR:
                        if data == "{":
                            balancing_stack.append("}")
                        elif data == "(":
                            balancing_stack.append(")")
                        elif data == "[":
                            balancing_stack.append("]")
                        elif data in ("}", ")", "]"):
                            if not balancing_stack:
                                raise TemplateSyntaxError(
                                    f"unexpected '{data}'", lineno, name, filename
                                )

                            expected_op = balancing_stack.pop()

                            if expected_op != data:
                                raise TemplateSyntaxError(
                                    f"unexpected '{data}', expected '{expected_op}'",
                                    lineno,
                                    name,
                                    filename,
                                )

                    # yield items
                    if data or tokens not in ignore_if_empty:
                        yield lineno, tokens, data

                    lineno += data.count("\n")

                line_starting = m.group()[-1:] == "\n"
                # fetch new position into new variable so that we can check
                # if there is an internal parsing error which would result
                # in an infinite loop
                pos2 = m.end()

                # handle state changes
                if new_state is not None:
                    # remove the uppermost state
                    if new_state == "#pop":
                        stack.pop()
                    # resolve the new state by group checking
                    elif new_state == "#bygroup":
                        for key, value in m.groupdict().items():
                            if value is not None:
                                stack.append(key)
                                break
                        else:
                            raise RuntimeError(
                                f"{regex!r} wanted to resolve the new state dynamically"
                                f" but no group matched"
                            )
                    # direct state name given
                    else:
                        stack.append(new_state)

                    statetokens = self.rules[stack[-1]]
                # we are still at the same position and no stack change.
                # this means a loop without break condition, avoid that and
                # raise error
                elif pos2 == pos:
                    raise RuntimeError(
                        f"{regex!r} yielded empty string without stack change"
                    )
                # publish the new position and start again
                pos = pos2
                break
            # if loop terminated without break we haven't found a single match
            # either we are at the end of the file or we have a problem
            else:
                # end of text
                if pos >= source_length:
                    return

                # something went wrong
                raise TemplateSyntaxError(
                    f"unexpected char {source[pos]!r} at {pos}", lineno, name, filename
                )
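# End-to-end sketch, assuming the usual ``jinja2.Environment`` API: the lexer
# is normally reached through the environment rather than instantiated
# directly, e.g.
#
#     env = Environment()
#     for tok in env.lexer.tokenize("Hello {{ name }}!"):
#         print(tok.lineno, tok.type, tok.value)
#
# which should yield data "Hello ", variable_begin, name "name",
# variable_end and data "!"; the trailing eof token ends the stream without
# being yielded.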