diff --git a/.github/workflows/run-tests.yml b/.github/workflows/run-tests.yml
index d630da2f..81f79bde 100644
--- a/.github/workflows/run-tests.yml
+++ b/.github/workflows/run-tests.yml
@@ -21,5 +21,6 @@ jobs:
         /bin/bash -c "pacman --noconfirm -Syu base-devel python python-pip && \
           pip install -e .[web] && \
           pip install -e .[check] && \
+          pip install -e .[s3] && \
           pip install -e .[test] && \
           make check tests"
diff --git a/CONFIGURING.md b/CONFIGURING.md
index 8838af47..050f31bb 100644
--- a/CONFIGURING.md
+++ b/CONFIGURING.md
@@ -89,10 +89,13 @@ Group name must refer to architecture, e.g. it should be `rsync:x86_64` for x86_
 
 ### `s3:*` groups
 
-Group name must refer to architecture, e.g. it should be `s3:x86_64` for x86_64 architecture. Requires `aws-cli` package to be installed. Do not forget to configure it for user `ahriman`.
+Group name must refer to architecture, e.g. it should be `s3:x86_64` for x86_64 architecture.
 
-* `command` - s3 command to run, space separated list of string, required.
-* `bucket` - bucket name (e.g. `s3://bucket/path`), string, required.
+* `access_key` - AWS access key ID, string, required.
+* `bucket` - bucket name (e.g. `bucket`), string, required.
+* `chunk_size` - chunk size for calculating entity tags, int, optional, default 8 * 1024 * 1024.
+* `region` - bucket region (e.g. `eu-central-1`), string, required.
+* `secret_key` - AWS secret access key, string, required.
 
 ## `web:*` groups
 
diff --git a/package/archlinux/PKGBUILD b/package/archlinux/PKGBUILD
index 88f7112a..a5c30960 100644
--- a/package/archlinux/PKGBUILD
+++ b/package/archlinux/PKGBUILD
@@ -9,13 +9,13 @@ url="https://github.com/arcan1s/ahriman"
 license=('GPL3')
 depends=('devtools' 'git' 'pyalpm' 'python-aur' 'python-srcinfo')
 makedepends=('python-pip')
-optdepends=('aws-cli: sync to s3'
-            'breezy: -bzr packages support'
+optdepends=('breezy: -bzr packages support'
             'darcs: -darcs packages support'
             'gnupg: package and repository sign'
             'mercurial: -hg packages support'
             'python-aiohttp: web server'
             'python-aiohttp-jinja2: web server'
+            'python-boto3: sync to s3'
             'python-jinja: html report generation'
             'rsync: sync by using rsync'
             'subversion: -svn packages support')
diff --git a/package/etc/ahriman.ini b/package/etc/ahriman.ini
index 9ccdc825..e3aa397c 100644
--- a/package/etc/ahriman.ini
+++ b/package/etc/ahriman.ini
@@ -40,7 +40,7 @@ target =
 command = rsync --archive --compress --partial --delete
 
 [s3]
-command = aws s3 sync --quiet --delete
+chunk_size = 8388608
 
 [web]
 host = 0.0.0.0
diff --git a/setup.py b/setup.py
index f468e077..e669f8a9 100644
--- a/setup.py
+++ b/setup.py
@@ -77,6 +77,9 @@ setup(
             "mypy",
             "pylint",
         ],
+        "s3": [
+            "boto3",
+        ],
         "test": [
             "pytest",
             "pytest-aiohttp",
diff --git a/src/ahriman/core/upload/s3.py b/src/ahriman/core/upload/s3.py
index 30e22d24..b1c1c6f8 100644
--- a/src/ahriman/core/upload/s3.py
+++ b/src/ahriman/core/upload/s3.py
@@ -17,24 +17,24 @@
 # You should have received a copy of the GNU General Public License
 # along with this program. If not, see <http://www.gnu.org/licenses/>.
 #
+import boto3  # type: ignore
+import hashlib
+
 from pathlib import Path
-from typing import Iterable
+from typing import Any, Dict, Generator, Iterable
 
 from ahriman.core.configuration import Configuration
 from ahriman.core.upload.upload import Upload
-from ahriman.core.util import check_output
 from ahriman.models.package import Package
 
 
 class S3(Upload):
     """
     aws-cli wrapper
-    :ivar bucket: full bucket name
-    :ivar command: command arguments for sync
+    :ivar bucket: boto3 S3 bucket object
+    :ivar chunk_size: chunk size for calculating checksums
     """
 
-    _check_output = check_output
-
     def __init__(self, architecture: str, configuration: Configuration) -> None:
         """
         default constructor
@@ -42,8 +42,68 @@ class S3(Upload):
         :param configuration: configuration instance
         """
         Upload.__init__(self, architecture, configuration)
-        self.bucket = configuration.get("s3", "bucket")
-        self.command = configuration.getlist("s3", "command")
+        self.bucket = self.get_bucket(configuration)
+        self.chunk_size = configuration.getint("s3", "chunk_size", fallback=8 * 1024 * 1024)
+
+    @staticmethod
+    def calculate_etag(path: Path, chunk_size: int) -> str:
+        """
+        calculate amazon s3 etag
+        credits to https://teppen.io/2018/10/23/aws_s3_verify_etags/
+        :param path: path to local file
+        :param chunk_size: read chunk size, which depends on client settings
+        :return: calculated entity tag for local file
+        """
+        md5s = []
+        with path.open("rb") as local_file:
+            for chunk in iter(lambda: local_file.read(chunk_size), b""):
+                md5s.append(hashlib.md5(chunk))
+
+        # in case if there is only one chunk it must be just this checksum
+        # and checksum of joined digest otherwise (including empty list)
+        checksum = md5s[0] if len(md5s) == 1 else hashlib.md5(b"".join(md5.digest() for md5 in md5s))
+        # in case if there are more than one chunk it should be appended with amount of chunks
+        suffix = f"-{len(md5s)}" if len(md5s) > 1 else ""
+        return f"{checksum.hexdigest()}{suffix}"
+
+    @staticmethod
+    def get_bucket(configuration: Configuration) -> Any:
+        """
+        create resource client from configuration
+        :param configuration: configuration instance
+        :return: amazon client
+        """
+        client = boto3.resource(service_name="s3",
+                                region_name=configuration.get("s3", "region"),
+                                aws_access_key_id=configuration.get("s3", "access_key"),
+                                aws_secret_access_key=configuration.get("s3", "secret_key"))
+        return client.Bucket(configuration.get("s3", "bucket"))
+
+    def get_local_files(self, path: Path) -> Dict[Path, str]:
+        """
+        get all local files and their calculated checksums
+        :param path: local path to sync
+        :return: map of path object to its checksum
+        """
+        # credits to https://stackoverflow.com/a/64915960
+        def walk(directory_path: Path) -> Generator[Path, None, None]:
+            for element in directory_path.iterdir():
+                if element.is_dir():
+                    yield from walk(element)
+                    continue
+                yield element
+        return {
+            local_file.relative_to(path): self.calculate_etag(local_file, self.chunk_size)
+            for local_file in walk(path)
+        }
+
+    def get_remote_objects(self) -> Dict[Path, Any]:
+        """
+        get all remote objects and their checksums
+        :return: map of path object to the remote s3 object
+        """
+        objects = self.bucket.objects.filter(Prefix=self.architecture)
+        return {Path(item.key).relative_to(self.architecture): item for item in objects}
 
     def sync(self, path: Path, built_packages: Iterable[Package]) -> None:
         """
@@ -51,5 +111,21 @@
         :param path: local path to sync
         :param built_packages: list of packages which has just been built
         """
-        # TODO rewrite to boto, but it is bullshit
-        S3._check_output(*self.command, str(path), self.bucket, exception=None, logger=self.logger)
+        remote_objects = self.get_remote_objects()
+        local_files = self.get_local_files(path)
+
+        # sync to remotes first
+        for local_file, checksum in local_files.items():
+            remote_object = remote_objects.get(local_file)
+            # 0 and -1 elements are " (double quote)
+            remote_checksum = remote_object.e_tag[1:-1] if remote_object is not None else None
+            if remote_checksum == checksum:
+                continue
+            remote_path = Path(self.architecture) / local_file
+            self.bucket.upload_file(str(path / local_file), str(remote_path))
+
+        # remove files which were removed locally
+        for local_file, remote_object in remote_objects.items():
+            if local_file in local_files:
+                continue
+            remote_object.delete()
diff --git a/tests/ahriman/core/alpm/test_pacman.py b/tests/ahriman/core/alpm/test_pacman.py
index ce14c7c3..3823e3af 100644
--- a/tests/ahriman/core/alpm/test_pacman.py
+++ b/tests/ahriman/core/alpm/test_pacman.py
@@ -14,4 +14,4 @@ def test_all_packages_with_provides(pacman: Pacman) -> None:
     """
     package list must contain provides packages
     """
-    assert 'sh' in pacman.all_packages()
+    assert "sh" in pacman.all_packages()
diff --git a/tests/ahriman/core/upload/conftest.py b/tests/ahriman/core/upload/conftest.py
new file mode 100644
index 00000000..0e490d8c
--- /dev/null
+++ b/tests/ahriman/core/upload/conftest.py
@@ -0,0 +1,22 @@
+import pytest
+
+from collections import namedtuple
+from typing import List
+from unittest.mock import MagicMock
+
+from ahriman.core.configuration import Configuration
+from ahriman.core.upload.s3 import S3
+
+
+_s3_object = namedtuple("s3_object", ["key", "e_tag", "delete"])
+
+
+@pytest.fixture
+def s3(configuration: Configuration) -> S3:
+    return S3("x86_64", configuration)
+
+
+@pytest.fixture
+def s3_remote_objects() -> List[_s3_object]:
+    delete_mock = MagicMock()
+    return list(map(lambda item: _s3_object(f"x86_64/{item}", f"\"{item}\"", delete_mock), ["a", "b", "c"]))
diff --git a/tests/ahriman/core/upload/test_s3.py b/tests/ahriman/core/upload/test_s3.py
index e8bc4a72..04e11631 100644
--- a/tests/ahriman/core/upload/test_s3.py
+++ b/tests/ahriman/core/upload/test_s3.py
@@ -1,16 +1,98 @@
 from pathlib import Path
 from pytest_mock import MockerFixture
+from typing import Any, List
+from unittest import mock
+from unittest.mock import MagicMock
 
-from ahriman.core.configuration import Configuration
 from ahriman.core.upload.s3 import S3
 
 
-def test_sync(configuration: Configuration, mocker: MockerFixture) -> None:
+_chunk_size = 8 * 1024 * 1024
+
+
+def test_calculate_etag_big(resource_path_root: Path) -> None:
+    """
+    must calculate checksum for path which is more than one chunk
+    """
+    path = resource_path_root / "models" / "big_file_checksum"
+    assert S3.calculate_etag(path, _chunk_size) == "3b15154eaeed22ae19ae4667d4b98d28-2"
+
+
+def test_calculate_etag_empty(resource_path_root: Path) -> None:
+    """
+    must calculate checksum for empty file correctly
+    """
+    path = resource_path_root / "models" / "empty_file_checksum"
+    assert S3.calculate_etag(path, _chunk_size) == "d41d8cd98f00b204e9800998ecf8427e"
+
+
+def test_calculate_etag_small(resource_path_root: Path) -> None:
+    """
+    must calculate checksum for path which is single chunk
+    """
+    path = resource_path_root / "models" / "package_ahriman_srcinfo"
+    assert S3.calculate_etag(path, _chunk_size) == "04e75b4aa0fe6033e711e8ea98e059b2"
+
+
+def test_get_local_files(s3: S3, resource_path_root: Path) -> None:
+    """
+    must get all local files recursively
+    """
+    expected = sorted([
+        Path("core/ahriman.ini"),
+        Path("core/logging.ini"),
+        Path("models/big_file_checksum"),
+        Path("models/empty_file_checksum"),
+        Path("models/package_ahriman_srcinfo"),
+        Path("models/package_tpacpi-bat-git_srcinfo"),
+        Path("models/package_yay_srcinfo"),
+        Path("web/templates/search-line.jinja2"),
+        Path("web/templates/build-status.jinja2"),
+        Path("web/templates/repo-index.jinja2"),
+        Path("web/templates/sorttable.jinja2"),
+        Path("web/templates/style.jinja2"),
+        Path("web/templates/search.jinja2"),
+    ])
+
+    local_files = list(sorted(s3.get_local_files(resource_path_root).keys()))
+    assert local_files == expected
+
+
+def test_get_remote_objects(s3: S3, s3_remote_objects: List[Any]) -> None:
+    """
+    must generate list of remote objects by calling boto3 function
+    """
+    expected = {Path(item.key).relative_to(s3.architecture): item for item in s3_remote_objects}
+
+    s3.bucket = MagicMock()
+    s3.bucket.objects.filter.return_value = s3_remote_objects
+
+    assert s3.get_remote_objects() == expected
+
+
+def test_sync(s3: S3, s3_remote_objects: List[Any], mocker: MockerFixture) -> None:
     """
     must run sync command
     """
-    check_output_mock = mocker.patch("ahriman.core.upload.s3.S3._check_output")
+    root = Path("path")
+    local_files = {
+        Path(item.key.replace("a", "d")): item.e_tag.replace("b", "d").replace("\"", "")
+        for item in s3_remote_objects
+    }
+    remote_objects = {Path(item.key): item for item in s3_remote_objects}
+    print(local_files)
+    print(remote_objects)
 
-    upload = S3("x86_64", configuration)
-    upload.sync(Path("path"), [])
-    check_output_mock.assert_called_once()
+    local_files_mock = mocker.patch("ahriman.core.upload.s3.S3.get_local_files", return_value=local_files)
+    remote_objects_mock = mocker.patch("ahriman.core.upload.s3.S3.get_remote_objects", return_value=remote_objects)
+    upload_mock = s3.bucket = MagicMock()
+
+    s3.sync(root, [])
+
+    local_files_mock.assert_called_once()
+    remote_objects_mock.assert_called_once()
+    upload_mock.upload_file.assert_has_calls([
+        mock.call(str(root / s3.architecture / "b"), f"{s3.architecture}/{s3.architecture}/b"),
+        mock.call(str(root / s3.architecture / "d"), f"{s3.architecture}/{s3.architecture}/d"),
+    ], any_order=True)
+    remote_objects[Path("x86_64/a")].delete.assert_called_once()
diff --git a/tests/testresources/core/ahriman.ini b/tests/testresources/core/ahriman.ini
index 0268af71..96f63bc3 100644
--- a/tests/testresources/core/ahriman.ini
+++ b/tests/testresources/core/ahriman.ini
@@ -48,8 +48,10 @@ command = rsync --archive --verbose --compress --partial --delete
 remote =
 
 [s3]
-bucket =
-command = aws s3 sync --quiet --delete
+access_key =
+bucket = bucket
+region = eu-central-1
+secret_key =
 
 [web]
 host = 0.0.0.0
diff --git a/tests/testresources/models/big_file_checksum b/tests/testresources/models/big_file_checksum
new file mode 100644
index 00000000..122c01a4
Binary files /dev/null and b/tests/testresources/models/big_file_checksum differ
diff --git a/tests/testresources/models/empty_file_checksum b/tests/testresources/models/empty_file_checksum
new file mode 100644
index 00000000..e69de29b
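
Example configuration (illustration only, not part of the patch): after this change the `[s3]` group is plain key-value settings read by boto3 instead of an `aws-cli` command line. A minimal section might look like the following; the bucket name and region are placeholder values, the credentials are intentionally left blank, and `chunk_size` can be omitted to use the 8 MiB default documented in CONFIGURING.md:

[s3]
access_key =
bucket = my-repository-bucket
chunk_size = 8388608
region = eu-central-1
secret_key =

Since `calculate_etag` reproduces the multipart entity tag scheme, `chunk_size` should match the chunk size the client uploads with (boto3's default multipart chunk size is 8 MiB); otherwise local and remote checksums will not match and files would be re-uploaded on every sync.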