feat: optimize archive reading

Instead of trying to load every database and look for files, this commit
introduces an optimization in which the service loads packages first,
groups them by database and loads files later.

In some cases it significantly decreases the time needed to load files.
This commit is contained in:
Evgenii Alekseev 2024-08-14 14:45:01 +03:00
parent fd3c6343f1
commit f44fa19c42
5 changed files with 92 additions and 80 deletions

View File

@ -17,6 +17,7 @@
# You should have received a copy of the GNU General Public License # You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>. # along with this program. If not, see <http://www.gnu.org/licenses/>.
# #
import itertools
import shutil import shutil
import tarfile import tarfile
@ -177,39 +178,48 @@ class Pacman(LazyLogging):
PacmanDatabase(database, self.configuration).sync(force=force) PacmanDatabase(database, self.configuration).sync(force=force)
transaction.release() transaction.release()
def files(self, packages: Iterable[str] | None = None) -> dict[str, set[str]]: def files(self, packages: Iterable[str]) -> dict[str, set[str]]:
""" """
extract list of known packages from the databases extract list of known packages from the databases
Args: Args:
packages(Iterable[str] | None, optional): filter by package names (Default value = None) packages(Iterable[str]): filter by package names
Returns: Returns:
dict[str, set[str]]: map of package name to its list of files dict[str, set[str]]: map of package name to its list of files
""" """
packages = packages or [] def extract(tar: tarfile.TarFile, package_names: dict[str, str]) -> Generator[tuple[str, set[str]], None, None]:
for package_name, version in package_names.items():
def extract(tar: tarfile.TarFile) -> Generator[tuple[str, set[str]], None, None]: path = Path(f"{package_name}-{version}") / "files"
for descriptor in filter(lambda info: info.path.endswith("/files"), tar.getmembers()): try:
package, *_ = str(Path(descriptor.path).parent).rsplit("-", 2) content = tar.extractfile(str(path))
if packages and package not in packages: except KeyError:
continue # skip unused packages # in case if database and its files has been desync somehow, the extractfile will raise
content = tar.extractfile(descriptor) # KeyError because the entry doesn't exist
content = None
if content is None: if content is None:
continue continue
# this is just array of files, however, the directories are with trailing slash, # this is just array of files, however, the directories are with trailing slash,
# which previously has been removed by the conversion to ``pathlib.Path`` # which previously has been removed by the conversion to ``pathlib.Path``
files = {filename.decode("utf8").rstrip().removesuffix("/") for filename in content.readlines()} files = {filename.decode("utf8").rstrip().removesuffix("/") for filename in content.readlines()}
yield package_name, files
yield package, files # sort is required for the following group by operation
descriptors = sorted(
(package for package_name in packages for package in self.package(package_name)),
key=lambda package: package.db.name
)
result: dict[str, set[str]] = {} result: dict[str, set[str]] = {}
for database in self.handle.get_syncdbs(): for database_name, pacman_packages in itertools.groupby(descriptors, lambda package: package.db.name):
database_file = self.repository_paths.pacman / "sync" / f"{database.name}.files.tar.gz" database_file = self.repository_paths.pacman / "sync" / f"{database_name}.files.tar.gz"
if not database_file.is_file(): if not database_file.is_file():
continue # no database file found continue # no database file found
package_names = {package.name: package.version for package in pacman_packages}
with tarfile.open(database_file, "r:gz") as archive: with tarfile.open(database_file, "r:gz") as archive:
result.update(extract(archive)) result.update(extract(archive, package_names))
return result return result

View File

@ -4,7 +4,7 @@ import pytest
from pathlib import Path from pathlib import Path
from pytest_mock import MockerFixture from pytest_mock import MockerFixture
from typing import Any, TypeVar from typing import Any, TypeVar
from unittest.mock import MagicMock from unittest.mock import MagicMock, PropertyMock
from ahriman.core.alpm.pacman import Pacman from ahriman.core.alpm.pacman import Pacman
from ahriman.core.alpm.remote import AUR from ahriman.core.alpm.remote import AUR
@ -476,6 +476,41 @@ def passwd() -> MagicMock:
return passwd return passwd
@pytest.fixture
def pyalpm_package_ahriman(aur_package_ahriman: AURPackage) -> MagicMock:
    """
    build mock object which imitates pyalpm package, filled from the AUR package fixture

    Args:
        aur_package_ahriman(AURPackage): package fixture

    Returns:
        MagicMock: pyalpm package mock
    """
    package = MagicMock()
    package_type = type(package)

    # database is exposed as a plain attribute, whereas its name is a property of the database mock
    database = MagicMock()
    type(database).name = PropertyMock(return_value="aur")
    package_type.db = database

    build_date = aur_package_ahriman.last_modified.replace(tzinfo=datetime.timezone.utc).timestamp()
    properties = {
        "base": aur_package_ahriman.package_base,
        "builddate": build_date,
        "conflicts": aur_package_ahriman.conflicts,
        "depends": aur_package_ahriman.depends,
        "desc": aur_package_ahriman.description,
        "licenses": aur_package_ahriman.license,
        "makedepends": aur_package_ahriman.make_depends,
        "name": aur_package_ahriman.name,
        "optdepends": aur_package_ahriman.opt_depends,
        "checkdepends": aur_package_ahriman.check_depends,
        "packager": "packager",
        "provides": aur_package_ahriman.provides,
        "version": aur_package_ahriman.version,
        "url": aur_package_ahriman.url,
        "groups": aur_package_ahriman.groups,
    }
    # property mocks have to be attached to the type of the mock, not to the instance itself
    for attribute, value in properties.items():
        setattr(package_type, attribute, PropertyMock(return_value=value))

    return package
@pytest.fixture @pytest.fixture
def remote_source() -> RemoteSource: def remote_source() -> RemoteSource:
""" """

View File

@ -1,3 +1,4 @@
import pyalpm
import pytest import pytest
import tarfile import tarfile
@ -175,31 +176,12 @@ def test_database_sync_forced(pacman: Pacman, mocker: MockerFixture) -> None:
sync_mock.assert_called_once_with(force=True) sync_mock.assert_called_once_with(force=True)
def test_files(pacman: Pacman, package_ahriman: Package, mocker: MockerFixture, resource_path_root: Path) -> None: def test_files_package(pacman: Pacman, package_ahriman: Package, pyalpm_package_ahriman: pyalpm.Package,
""" mocker: MockerFixture, resource_path_root: Path) -> None:
must load files from databases
"""
handle_mock = MagicMock()
handle_mock.get_syncdbs.return_value = [MagicMock()]
pacman.handle = handle_mock
tarball = resource_path_root / "core" / "arcanisrepo.files.tar.gz"
with tarfile.open(tarball, "r:gz") as fd:
mocker.patch("pathlib.Path.is_file", return_value=True)
open_mock = mocker.patch("ahriman.core.alpm.pacman.tarfile.open", return_value=fd)
files = pacman.files()
assert len(files) == 2
assert package_ahriman.base in files
assert "usr/bin/ahriman" in files[package_ahriman.base]
open_mock.assert_called_once_with(pytest.helpers.anyvar(int), "r:gz")
def test_files_package(pacman: Pacman, package_ahriman: Package, mocker: MockerFixture,
resource_path_root: Path) -> None:
""" """
must load files only for the specified package must load files only for the specified package
""" """
mocker.patch("ahriman.core.alpm.pacman.Pacman.package", return_value=[pyalpm_package_ahriman])
handle_mock = MagicMock() handle_mock = MagicMock()
handle_mock.get_syncdbs.return_value = [MagicMock()] handle_mock.get_syncdbs.return_value = [MagicMock()]
pacman.handle = handle_mock pacman.handle = handle_mock
@ -210,34 +192,35 @@ def test_files_package(pacman: Pacman, package_ahriman: Package, mocker: MockerF
mocker.patch("pathlib.Path.is_file", return_value=True) mocker.patch("pathlib.Path.is_file", return_value=True)
mocker.patch("ahriman.core.alpm.pacman.tarfile.open", return_value=fd) mocker.patch("ahriman.core.alpm.pacman.tarfile.open", return_value=fd)
files = pacman.files(package_ahriman.base) files = pacman.files([package_ahriman.base])
assert len(files) == 1 assert len(files) == 1
assert package_ahriman.base in files assert package_ahriman.base in files
def test_files_skip(pacman: Pacman, mocker: MockerFixture) -> None: def test_files_skip(pacman: Pacman, pyalpm_package_ahriman: pyalpm.Package, mocker: MockerFixture) -> None:
""" """
must return empty list if no database found must return empty list if no database found
""" """
mocker.patch("ahriman.core.alpm.pacman.Pacman.package", return_value=[pyalpm_package_ahriman])
handle_mock = MagicMock() handle_mock = MagicMock()
handle_mock.get_syncdbs.return_value = [MagicMock()] handle_mock.get_syncdbs.return_value = [MagicMock()]
pacman.handle = handle_mock pacman.handle = handle_mock
mocker.patch("pathlib.Path.is_file", return_value=False) mocker.patch("pathlib.Path.is_file", return_value=False)
assert not pacman.files() assert not pacman.files([pyalpm_package_ahriman.name])
def test_files_no_content(pacman: Pacman, mocker: MockerFixture) -> None: def test_files_no_content(pacman: Pacman, pyalpm_package_ahriman: pyalpm.Package, mocker: MockerFixture) -> None:
""" """
must skip package if no content can be loaded must skip package if no content can be loaded
""" """
mocker.patch("ahriman.core.alpm.pacman.Pacman.package", return_value=[pyalpm_package_ahriman])
handle_mock = MagicMock() handle_mock = MagicMock()
handle_mock.get_syncdbs.return_value = [MagicMock()] handle_mock.get_syncdbs.return_value = [MagicMock()]
pacman.handle = handle_mock pacman.handle = handle_mock
tar_mock = MagicMock() tar_mock = MagicMock()
tar_mock.getmembers.return_value = [MagicMock()]
tar_mock.extractfile.return_value = None tar_mock.extractfile.return_value = None
open_mock = MagicMock() open_mock = MagicMock()
@ -246,7 +229,28 @@ def test_files_no_content(pacman: Pacman, mocker: MockerFixture) -> None:
mocker.patch("pathlib.Path.is_file", return_value=True) mocker.patch("pathlib.Path.is_file", return_value=True)
mocker.patch("ahriman.core.alpm.pacman.tarfile.open", return_value=open_mock) mocker.patch("ahriman.core.alpm.pacman.tarfile.open", return_value=open_mock)
assert not pacman.files() assert not pacman.files([pyalpm_package_ahriman.name])
def test_files_no_entry(pacman: Pacman, pyalpm_package_ahriman: pyalpm.Package, mocker: MockerFixture) -> None:
    """
    must skip package if it wasn't found in the archive
    """
    mocker.patch("ahriman.core.alpm.pacman.Pacman.package", return_value=[pyalpm_package_ahriman])

    database_handle = MagicMock()
    database_handle.get_syncdbs.return_value = [MagicMock()]
    pacman.handle = database_handle

    # archive raises KeyError on any attempt to extract an entry
    archive = MagicMock()
    archive.extractfile.side_effect = KeyError()
    # object returned by the patched open call, which yields the archive on entering the context
    archive_context = MagicMock()
    archive_context.__enter__.return_value = archive

    mocker.patch("pathlib.Path.is_file", return_value=True)
    mocker.patch("ahriman.core.alpm.pacman.tarfile.open", return_value=archive_context)

    files = pacman.files([pyalpm_package_ahriman.name])
    assert not files
def test_package(pacman: Pacman) -> None: def test_package(pacman: Pacman) -> None:

View File

@ -1,4 +1,3 @@
import datetime
import pytest import pytest
from typing import Any from typing import Any
@ -8,7 +7,6 @@ from pytest_mock import MockerFixture
from ahriman import __version__ from ahriman import __version__
from ahriman.core.alpm.pacman import Pacman from ahriman.core.alpm.pacman import Pacman
from ahriman.core.alpm.remote import AUR from ahriman.core.alpm.remote import AUR
from ahriman.models.aur_package import AURPackage
from ahriman.models.build_status import BuildStatus, BuildStatusEnum from ahriman.models.build_status import BuildStatus, BuildStatusEnum
from ahriman.models.counters import Counters from ahriman.models.counters import Counters
from ahriman.models.filesystem_package import FilesystemPackage from ahriman.models.filesystem_package import FilesystemPackage
@ -134,41 +132,6 @@ def pyalpm_handle(pyalpm_package_ahriman: MagicMock) -> MagicMock:
return mock return mock
@pytest.fixture
def pyalpm_package_ahriman(aur_package_ahriman: AURPackage) -> MagicMock:
    """
    build mock object which imitates pyalpm package, filled from the AUR package fixture

    Args:
        aur_package_ahriman(AURPackage): package fixture

    Returns:
        MagicMock: pyalpm package mock
    """
    package = MagicMock()
    package_type = type(package)

    # database is exposed as a plain attribute, whereas its name is a property of the database mock
    database = MagicMock()
    type(database).name = PropertyMock(return_value="aur")
    package_type.db = database

    build_date = aur_package_ahriman.last_modified.replace(tzinfo=datetime.timezone.utc).timestamp()
    properties = {
        "base": aur_package_ahriman.package_base,
        "builddate": build_date,
        "conflicts": aur_package_ahriman.conflicts,
        "depends": aur_package_ahriman.depends,
        "desc": aur_package_ahriman.description,
        "licenses": aur_package_ahriman.license,
        "makedepends": aur_package_ahriman.make_depends,
        "name": aur_package_ahriman.name,
        "optdepends": aur_package_ahriman.opt_depends,
        "checkdepends": aur_package_ahriman.check_depends,
        "packager": "packager",
        "provides": aur_package_ahriman.provides,
        "version": aur_package_ahriman.version,
        "url": aur_package_ahriman.url,
        "groups": aur_package_ahriman.groups,
    }
    # property mocks have to be attached to the type of the mock, not to the instance itself
    for attribute, value in properties.items():
        setattr(package_type, attribute, PropertyMock(return_value=value))

    return package
@pytest.fixture @pytest.fixture
def pyalpm_package_description_ahriman(package_description_ahriman: PackageDescription) -> MagicMock: def pyalpm_package_description_ahriman(package_description_ahriman: PackageDescription) -> MagicMock:
""" """