It has been found that there are two cases in which a PKGBUILD was not parsed correctly:

1. The major case, in which a quotation mark appears inside a comment line, which would cause a "ValueError: No closing quotation" error.
2. The minor case, in which there are UTF symbols in the PKGBUILD file (e.g. hieroglyphs, see ttf-google-fonts-git), which would cause incorrect reading in the `_is_escaped` method.
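For illustration, a minimal sketch of the first case, assuming `StringIO` from the standard `io` module and the parser defined below (the quote inside the comment is now skipped instead of breaking the tokenizer):

    >>> from io import StringIO
    >>> parser = PkgbuildParser(StringIO("pkgname=example  # don't trip over this quote\n"))
    >>> for patch in parser.parse():
    >>>     print(f"{patch.key} = {patch.value}")
    pkgname = example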
#
# Copyright (c) 2021-2024 ahriman team.
#
# This file is part of ahriman
# (see https://github.com/arcan1s/ahriman).
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
import itertools
import re
import shlex

from collections.abc import Generator
from enum import StrEnum
from typing import IO

from ahriman.core.exceptions import PkgbuildParserError
from ahriman.models.pkgbuild_patch import PkgbuildPatch


class PkgbuildToken(StrEnum):
    """
    well-known tokens dictionary

    Attributes:
        ArrayEnds(PkgbuildToken): (class attribute) array ends token
        ArrayStarts(PkgbuildToken): (class attribute) array starts token
        Comma(PkgbuildToken): (class attribute) comma token
        Comment(PkgbuildToken): (class attribute) comment token
        FunctionDeclaration(PkgbuildToken): (class attribute) function declaration token
        FunctionEnds(PkgbuildToken): (class attribute) function ends token
        FunctionStarts(PkgbuildToken): (class attribute) function starts token
    """

    ArrayStarts = "("
    ArrayEnds = ")"

    Comma = ","

    Comment = "#"

    FunctionDeclaration = "()"
    FunctionStarts = "{"
    FunctionEnds = "}"


class PkgbuildParser(shlex.shlex):
    """
    simple pkgbuild reader implementation in pure python, because others suck.

    What is it:

    #. Simple PKGBUILD parser written in python.
    #. No shell execution, so it is free from random shell attacks.
    #. Able to parse simple constructions (assignments, comments, functions, arrays).

    What it is not:

    #. Fully functional shell parser.
    #. Shell executor.
    #. No parameter expansion.

    For more details on what it supports, please consult the test cases.

    Examples:
        This class is heavily based on the :mod:`shlex` parser, but instead of strings it operates with the
        :class:`ahriman.models.pkgbuild_patch.PkgbuildPatch` objects. The main way to use it is to call the
        :func:`parse()` function and collect parsed objects, e.g.::

            >>> parser = PkgbuildParser(StringIO("input string"))
            >>> for patch in parser.parse():
            >>>     print(f"{patch.key} = {patch.value}")

        It doesn't store the state of the fields (but operates with the :mod:`shlex` parser state), so no shell
        post-processing is performed (e.g. variable substitution).
    """

    _ARRAY_ASSIGNMENT = re.compile(r"^(?P<key>\w+)=$")
    # in addition to usual assignment, functions can have dash
    _FUNCTION_DECLARATION = re.compile(r"^(?P<key>[\w-]+)$")
    _STRING_ASSIGNMENT = re.compile(r"^(?P<key>\w+)=(?P<value>.+)$")

    def __init__(self, stream: IO[str]) -> None:
        """
        Args:
            stream(IO[str]): input stream containing PKGBUILD content
        """
        shlex.shlex.__init__(self, stream, posix=True, punctuation_chars=True)
        self._io = stream  # direct access without type casting

        # ignore substitution and extend bash symbols
        self.wordchars += "${}#:+-@!"
        # with the default behaviour, it would ignore, for example, the fragment part of a url outside of quotes
        self.commenters = ""

    @staticmethod
    def _expand_array(array: list[str]) -> list[str]:
        """
        bash array expansion simulator. It takes a raw array and tries to expand constructions like
        ``(first prefix-{mid1,mid2}-suffix last)`` into ``(first prefix-mid1-suffix prefix-mid2-suffix last)``

        Args:
            array(list[str]): input array

        Returns:
            list[str]: either source array or expanded array if possible

        Raises:
            PkgbuildParserError: if there are errors in parser
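
        Examples:
            Illustrative sketch of how an already tokenized brace construction is expected to expand (the input
            list below mimics the tokens produced by the parser)::

                >>> PkgbuildParser._expand_array(["first", "prefix-{mid1", ",", "mid2}-suffix", "last"])
                ['first', 'prefix-mid1-suffix', 'prefix-mid2-suffix', 'last']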
        """
        # we are using comma as marker for expansion (if any)
        if PkgbuildToken.Comma not in array:
            return array
        # again sanity check, for expansion there are at least 3 elements (first, last and comma)
        if len(array) < 3:
            return array

        result = []
        buffer, prefix = [], None

        for index, (first, second) in enumerate(itertools.pairwise(array)):
            match (first, second):
                # in this case we check if expansion should be started
                # this condition matches "prefix{first", ","
                case (_, PkgbuildToken.Comma) if PkgbuildToken.FunctionStarts in first:
                    prefix, part = first.rsplit(PkgbuildToken.FunctionStarts, maxsplit=1)
                    buffer.append(f"{prefix}{part}")

                # the last element case, it matches either ",", "last}" or ",", "last}suffix"
                # in case if there is suffix, it must be appended to all list elements
                case (PkgbuildToken.Comma, _) if prefix is not None and PkgbuildToken.FunctionEnds in second:
                    part, suffix = second.rsplit(PkgbuildToken.FunctionEnds, maxsplit=1)
                    buffer.append(f"{prefix}{part}")
                    result.extend([f"{part}{suffix}" for part in buffer])
                    # reset state
                    buffer, prefix = [], None

                # we already have a prefix string, so we are in progress of expansion
                # we always operate on the last element, so this matches ",", "next"
                case (PkgbuildToken.Comma, _) if prefix is not None:
                    buffer.append(f"{prefix}{second}")

                # exactly the first element of the list
                case (_, _) if prefix is None and index == 0:
                    result.append(first)

                # any next normal element
                case (_, _) if prefix is None:
                    result.append(second)

        # small sanity check
        if prefix is not None:
            raise PkgbuildParserError("error in array expansion", array)

        return result

    def _is_escaped(self) -> bool:
        """
        check if the last element was quoted. The ``shlex.shlex`` parser doesn't provide information about whether
        the token was quoted or not, thus there is no difference between "'#'" (hash sign in quotes) and "#" (hash
        sign without quotes). This method simply rolls back to the last non-space character and checks whether it
        is a quotation mark

        Returns:
            bool: ``True`` if the previous element of the stream is a quote or escaped and ``False`` otherwise
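
        Examples:
            For instance, in an array like ``('first' '#' 'second')`` the quoted ``'#'`` must be treated as a
            regular value rather than a comment marker, and a quoted ``')'`` must not close the array; this check
            is what lets the callers tell those cases apart.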
        """
        # wrapper around reading utf symbols from a random position of the stream
        def read_last() -> tuple[int, str]:
            while (position := self._io.tell()) > 0:
                try:
                    return position, self._io.read(1)
                except UnicodeDecodeError:
                    self._io.seek(position - 1)

            raise PkgbuildParserError("reached starting position, no valid symbols found")

        current_position = self._io.tell()

        last_char = penultimate_char = None
        index = current_position - 1
        while index > 0:
            self._io.seek(index)

            index, last_char = read_last()
            if last_char.isspace():
                index -= 1
                continue

            if index > 1:
                self._io.seek(index - 1)
                _, penultimate_char = read_last()

            break

        self._io.seek(current_position)  # reset position of the stream
        is_quoted = last_char is not None and last_char in self.quotes
        is_escaped = penultimate_char is not None and penultimate_char in self.escape

        return is_quoted or is_escaped

    def _parse_array(self) -> list[str]:
        """
        parse an array from the PKGBUILD. This method will extract tokens from the parser until it matches the
        closing array bracket, modifying the source parser state

        Returns:
            list[str]: extracted array elements

        Raises:
            PkgbuildParserError: if array is not closed
        """
        def extract() -> Generator[str, None, None]:
            while token := self.get_token():
                match token:
                    case _ if self._is_escaped():
                        pass
                    case PkgbuildToken.ArrayEnds:
                        break
                    case PkgbuildToken.Comment:
                        self.instream.readline()
                        continue

                yield token

            if token != PkgbuildToken.ArrayEnds:
                raise PkgbuildParserError("no closing array bracket found")

        return self._expand_array(list(extract()))

    def _parse_function(self) -> str:
        """
        parse a function from the PKGBUILD. This method will extract tokens from the parser until it matches the
        closing function bracket, modifying the source parser state. Instead of trying to combine tokens together,
        it uses file positions and reads the content again within this range

        Returns:
            str: function body

        Raises:
            PkgbuildParserError: if function body wasn't found or parser input stream doesn't support position reading
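
        Examples:
            Roughly speaking, for a declaration such as ``build() { make; }`` this method is expected to return the
            brace-delimited body (``{ make; }`` in this sketch), while the function name itself is handled by
            :func:`_parse_token()`.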
        """
        # find start and end positions
        start_position = end_position = -1
        counter = 0  # simple processing of the inner "{" and "}"
        for token in self:
            match token:
                case _ if self._is_escaped():
                    continue
                case PkgbuildToken.FunctionStarts:
                    if counter == 0:
                        start_position = self._io.tell() - 1
                    counter += 1
                case PkgbuildToken.FunctionEnds:
                    end_position = self._io.tell()
                    if self.state != self.eof:  # type: ignore[attr-defined]
                        end_position -= 1  # if we are not at the end of the file, position is _after_ the token
                    counter -= 1
                    if counter == 0:
                        break
                case PkgbuildToken.Comment:
                    self.instream.readline()

        if not 0 < start_position < end_position:
            raise PkgbuildParserError("function body wasn't found")

        # read the specified interval from source stream
        self._io.seek(start_position - 1)  # start from the previous symbol
        # we cannot use :func:`read()` here, because it reads characters, not bytes
        content = ""
        while self._io.tell() != end_position and (next_char := self._io.read(1)):
            content += next_char

        # special case of the end of file
        if self.state == self.eof:  # type: ignore[attr-defined]
            content += self._io.read(1)

        return content

    def _parse_token(self, token: str) -> Generator[PkgbuildPatch, None, None]:
        """
        parse a single token into a PKGBUILD field

        Args:
            token(str): current token

        Yields:
            PkgbuildPatch: extracted PKGBUILD node
        """
        # simple assignment rule
        if m := self._STRING_ASSIGNMENT.match(token):
            key = m.group("key")
            value = m.group("value")
            yield PkgbuildPatch(key, value)
            return

        if token == PkgbuildToken.Comment:
            self.instream.readline()
            return

        match self.get_token():
            # array processing. Arrays will be sent as "key=", "(", values, ")"
            case PkgbuildToken.ArrayStarts if m := self._ARRAY_ASSIGNMENT.match(token):
                key = m.group("key")
                value = self._parse_array()
                yield PkgbuildPatch(key, value)

            # functions processing. Function will be sent as "name", "()", "{", body, "}"
            case PkgbuildToken.FunctionDeclaration if self._FUNCTION_DECLARATION.match(token):
                key = f"{token}{PkgbuildToken.FunctionDeclaration}"
                value = self._parse_function()
                yield PkgbuildPatch(key, value)  # this is not a mistake, assign to token without ()

            # special function case, where "(" and ")" are separated tokens, e.g. "pkgver ( )"
            case PkgbuildToken.ArrayStarts if self._FUNCTION_DECLARATION.match(token):
                next_token = self.get_token()
                if next_token == PkgbuildToken.ArrayEnds:  # replace closing bracket with "()"
                    next_token = PkgbuildToken.FunctionDeclaration
                self.push_token(next_token)  # type: ignore[arg-type]
                yield from self._parse_token(token)

            # some random token received without continuation, let's guess it is an empty assignment (i.e. key=)
            case other if other is not None:
                yield from self._parse_token(other)

    def parse(self) -> Generator[PkgbuildPatch, None, None]:
        """
        parse source stream and yield parsed entries

        Yields:
            PkgbuildPatch: extracted PKGBUILD node
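
        Examples:
            A minimal sketch, assuming ``StringIO`` is imported from :mod:`io` and that
            :class:`ahriman.models.pkgbuild_patch.PkgbuildPatch` stores the key and value as passed::

                >>> parser = PkgbuildParser(StringIO("arch=(i686 x86_64)"))
                >>> for patch in parser.parse():
                >>>     print(f"{patch.key} = {patch.value}")
                arch = ['i686', 'x86_64']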
        """
        for token in self:
            yield from self._parse_token(token)