ahriman/src/ahriman/core/alpm/pkgbuild_parser.py
Evgenii Alekseev 343435b3bf fix: fix pkgbuild parsing in some cases
It has been found that there are two cases in which a PKGBUILD was not
parsed correctly:

1. Major case, in which there is a quotation mark inside a comment line,
   which would cause a "ValueError: No closing quotation" error.
2. Minor case, if there are UTF symbols in the PKGBUILD file (e.g.
   hieroglyphs, see ttf-google-fonts-git), it would cause incorrect
   reading in the `_is_escaped` method.
2024-09-26 16:48:38 +03:00
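As a rough illustration of the first case (a hypothetical minimal reproduction, not taken from the
commit itself), an apostrophe inside a trailing comment used to trip the quote tracking:

    from io import StringIO
    from ahriman.core.alpm.pkgbuild_parser import PkgbuildParser

    # before this fix the unclosed apostrophe in the comment raised
    # "ValueError: No closing quotation"; now the comment is skipped
    parser = PkgbuildParser(StringIO("depends=(python)  # don't quote this\n"))
    print(list(parser.parse()))  # a single patch for depends=(python)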


#
# Copyright (c) 2021-2024 ahriman team.
#
# This file is part of ahriman
# (see https://github.com/arcan1s/ahriman).
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
import itertools
import re
import shlex

from collections.abc import Generator
from enum import StrEnum
from typing import IO

from ahriman.core.exceptions import PkgbuildParserError
from ahriman.models.pkgbuild_patch import PkgbuildPatch


class PkgbuildToken(StrEnum):
"""
    well-known tokens dictionary

    Attributes:
ArrayEnds(PkgbuildToken): (class attribute) array ends token
ArrayStarts(PkgbuildToken): (class attribute) array starts token
Comma(PkgbuildToken): (class attribute) comma token
Comment(PkgbuildToken): (class attribute) comment token
FunctionDeclaration(PkgbuildToken): (class attribute) function declaration token
FunctionEnds(PkgbuildToken): (class attribute) function ends token
FunctionStarts(PkgbuildToken): (class attribute) function starts token
"""
ArrayStarts = "("
ArrayEnds = ")"
Comma = ","
Comment = "#"
FunctionDeclaration = "()"
FunctionStarts = "{"
FunctionEnds = "}"


class PkgbuildParser(shlex.shlex):
"""
    simple pkgbuild reader implementation in pure python, because others suck.

    What is it:

    #. Simple PKGBUILD parser written in python.
    #. No shell execution, so it is free from random shell attacks.
    #. Able to parse simple constructions (assignments, comments, functions, arrays).

    What it is not:

    #. Fully functional shell parser.
    #. Shell executor.
    #. No parameter expansion.

    For more details on what exactly is supported, please consult the test cases.

    Examples:
        This class is heavily based on :mod:`shlex` parser, but instead of strings operates with the
        :class:`ahriman.models.pkgbuild_patch.PkgbuildPatch` objects. The main way to use it is to call :func:`parse()`
        function and collect parsed objects, e.g.::

            >>> parser = PkgbuildParser(StringIO("input string"))
            >>> for patch in parser.parse():
            >>>     print(f"{patch.key} = {patch.value}")

    It doesn't store the state of the fields (but operates with the :mod:`shlex` parser state), so no shell
    post-processing is performed (e.g. variable substitution).
"""

    _ARRAY_ASSIGNMENT = re.compile(r"^(?P<key>\w+)=$")

    # in addition to usual assignment, functions can have dash
    _FUNCTION_DECLARATION = re.compile(r"^(?P<key>[\w-]+)$")

    _STRING_ASSIGNMENT = re.compile(r"^(?P<key>\w+)=(?P<value>.+)$")

    def __init__(self, stream: IO[str]) -> None:
"""
Args:
stream(IO[str]): input stream containing PKGBUILD content
"""
shlex.shlex.__init__(self, stream, posix=True, punctuation_chars=True)
self._io = stream # direct access without type casting
# ignore substitution and extend bash symbols
self.wordchars += "${}#:+-@!"
        # with the default behaviour, ``#`` would start a comment and the parser would drop, for example,
        # the fragment part of an url outside of quotes
self.commenters = ""

    @staticmethod
def _expand_array(array: list[str]) -> list[str]:
"""
        bash array expansion simulator. It takes a raw array and tries to expand constructions like
        ``(first prefix-{mid1,mid2}-suffix last)`` into ``(first prefix-mid1-suffix prefix-mid2-suffix last)``

        Args:
            array(list[str]): input array

        Returns:
            list[str]: either source array or expanded array if possible

        Raises:
            PkgbuildParserError: if there are errors in parser
"""
# we are using comma as marker for expansion (if any)
if PkgbuildToken.Comma not in array:
return array
# again sanity check, for expansion there are at least 3 elements (first, last and comma)
if len(array) < 3:
return array
result = []
buffer, prefix = [], None
for index, (first, second) in enumerate(itertools.pairwise(array)):
match (first, second):
# in this case we check if expansion should be started
# this condition matches "prefix{first", ","
case (_, PkgbuildToken.Comma) if PkgbuildToken.FunctionStarts in first:
prefix, part = first.rsplit(PkgbuildToken.FunctionStarts, maxsplit=1)
buffer.append(f"{prefix}{part}")
# the last element case, it matches either ",", "last}" or ",", "last}suffix"
# in case if there is suffix, it must be appended to all list elements
case (PkgbuildToken.Comma, _) if prefix is not None and PkgbuildToken.FunctionEnds in second:
part, suffix = second.rsplit(PkgbuildToken.FunctionEnds, maxsplit=1)
buffer.append(f"{prefix}{part}")
result.extend([f"{part}{suffix}" for part in buffer])
# reset state
buffer, prefix = [], None
# we have already prefix string, so we are in progress of expansion
# we always operate the last element, so this matches ",", "next"
case (PkgbuildToken.Comma, _) if prefix is not None:
buffer.append(f"{prefix}{second}")
# exactly first element of the list
case (_, _) if prefix is None and index == 0:
result.append(first)
# any next normal element
case (_, _) if prefix is None:
result.append(second)
# small sanity check
if prefix is not None:
raise PkgbuildParserError("error in array expansion", array)
return result

    def _is_escaped(self) -> bool:
"""
        check if the last element was quoted. The ``shlex.shlex`` parser doesn't provide information about whether
        the token was quoted or not, thus there is no difference between ``"'#'"`` (hash sign in quotes) and ``"#"``
        (hash sign without quotes). This method simply rolls back to the last non-space character and checks whether
        it is a quotation mark

        Returns:
            bool: ``True`` if the previous element of the stream is quoted or escaped and ``False`` otherwise
"""
        # wrapper around reading a single character from a random position of the stream; seeking can land in
        # the middle of a multi-byte character, in which case we step one position back until a character can
        # be decoded
def read_last() -> tuple[int, str]:
while (position := self._io.tell()) > 0:
try:
return position, self._io.read(1)
except UnicodeDecodeError:
self._io.seek(position - 1)
raise PkgbuildParserError("reached starting position, no valid symbols found")
current_position = self._io.tell()
last_char = penultimate_char = None
index = current_position - 1
while index > 0:
self._io.seek(index)
index, last_char = read_last()
if last_char.isspace():
index -= 1
continue
if index > 1:
self._io.seek(index - 1)
_, penultimate_char = read_last()
break
self._io.seek(current_position) # reset position of the stream
is_quoted = last_char is not None and last_char in self.quotes
is_escaped = penultimate_char is not None and penultimate_char in self.escape
return is_quoted or is_escaped

    def _parse_array(self) -> list[str]:
"""
        parse array from the PKGBUILD. This method will extract tokens from the parser until it matches the closing
        array bracket, modifying the source parser state

        Returns:
            list[str]: extracted array elements

        Raises:
            PkgbuildParserError: if array is not closed
"""
def extract() -> Generator[str, None, None]:
while token := self.get_token():
match token:
case _ if self._is_escaped():
pass
case PkgbuildToken.ArrayEnds:
break
case PkgbuildToken.Comment:
self.instream.readline()
continue
yield token
if token != PkgbuildToken.ArrayEnds:
raise PkgbuildParserError("no closing array bracket found")
return self._expand_array(list(extract()))

    def _parse_function(self) -> str:
"""
        parse function from the PKGBUILD. This method will extract tokens from the parser until it matches the
        closing function bracket, modifying the source parser state. Instead of trying to combine tokens together,
        it remembers the stream positions and reads the content again within this range

        Returns:
            str: function body

        Raises:
            PkgbuildParserError: if function body wasn't found or parser input stream doesn't support position reading
"""
# find start and end positions
start_position = end_position = -1
counter = 0 # simple processing of the inner "{" and "}"
for token in self:
match token:
case _ if self._is_escaped():
continue
case PkgbuildToken.FunctionStarts:
if counter == 0:
start_position = self._io.tell() - 1
counter += 1
case PkgbuildToken.FunctionEnds:
end_position = self._io.tell()
if self.state != self.eof: # type: ignore[attr-defined]
end_position -= 1 # if we are not at the end of the file, position is _after_ the token
counter -= 1
if counter == 0:
break
case PkgbuildToken.Comment:
self.instream.readline()
if not 0 < start_position < end_position:
raise PkgbuildParserError("function body wasn't found")
# read the specified interval from source stream
self._io.seek(start_position - 1) # start from the previous symbol
# we cannot use :func:`read()` here, because it reads characters, not bytes
content = ""
while self._io.tell() != end_position and (next_char := self._io.read(1)):
content += next_char
# special case of the end of file
if self.state == self.eof: # type: ignore[attr-defined]
content += self._io.read(1)
return content

    def _parse_token(self, token: str) -> Generator[PkgbuildPatch, None, None]:
"""
        parse single token to the PKGBUILD field

        Args:
            token(str): current token

        Yields:
            PkgbuildPatch: extracted PKGBUILD node
"""
# simple assignment rule
if m := self._STRING_ASSIGNMENT.match(token):
key = m.group("key")
value = m.group("value")
yield PkgbuildPatch(key, value)
return
if token == PkgbuildToken.Comment:
self.instream.readline()
return
match self.get_token():
# array processing. Arrays will be sent as "key=", "(", values, ")"
case PkgbuildToken.ArrayStarts if m := self._ARRAY_ASSIGNMENT.match(token):
key = m.group("key")
value = self._parse_array()
yield PkgbuildPatch(key, value)
# functions processing. Function will be sent as "name", "()", "{", body, "}"
case PkgbuildToken.FunctionDeclaration if self._FUNCTION_DECLARATION.match(token):
key = f"{token}{PkgbuildToken.FunctionDeclaration}"
value = self._parse_function()
yield PkgbuildPatch(key, value) # this is not mistake, assign to token without ()
# special function case, where "(" and ")" are separated tokens, e.g. "pkgver ( )"
case PkgbuildToken.ArrayStarts if self._FUNCTION_DECLARATION.match(token):
next_token = self.get_token()
if next_token == PkgbuildToken.ArrayEnds: # replace closing bracket with "()"
next_token = PkgbuildToken.FunctionDeclaration
self.push_token(next_token) # type: ignore[arg-type]
yield from self._parse_token(token)
            # some random token received without continuation, let's guess it is an empty assignment (i.e. key=)
case other if other is not None:
yield from self._parse_token(other)

    def parse(self) -> Generator[PkgbuildPatch, None, None]:
"""
        parse source stream and yield parsed entries

        Yields:
            PkgbuildPatch: extracted PKGBUILD node
"""
for token in self:
yield from self._parse_token(token)