commit 3ccb39b970f80f0328fc32c0a86e56b395c00fe5
Author: Jan-Marten Brüggemann
Date:   Fri Sep 13 20:58:49 2024 +0200

    initial commit

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..04fb59e
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+.cache
+.conda
diff --git a/environment.yml b/environment.yml
new file mode 100644
index 0000000..c86352a
--- /dev/null
+++ b/environment.yml
@@ -0,0 +1,43 @@
+name: sync_kiwix
+channels:
+  - defaults
+dependencies:
+  - _libgcc_mutex=0.1=main
+  - _openmp_mutex=5.1=1_gnu
+  - astroid=3.2.4=py39h06a4308_0
+  - brotli-python=1.0.9=py39h6a678d5_8
+  - ca-certificates=2024.7.2=h06a4308_0
+  - certifi=2024.8.30=py39h06a4308_0
+  - charset-normalizer=3.3.2=pyhd3eb1b0_0
+  - dill=0.3.8=py39h06a4308_0
+  - idna=3.7=py39h06a4308_0
+  - isort=5.13.2=py39h06a4308_0
+  - ld_impl_linux-64=2.38=h1181459_1
+  - libffi=3.4.4=h6a678d5_1
+  - libgcc-ng=11.2.0=h1234567_1
+  - libgomp=11.2.0=h1234567_1
+  - libstdcxx-ng=11.2.0=h1234567_1
+  - mccabe=0.7.0=pyhd3eb1b0_0
+  - ncurses=6.4=h6a678d5_0
+  - openssl=3.0.15=h5eee18b_0
+  - pip=24.2=py39h06a4308_0
+  - platformdirs=3.10.0=py39h06a4308_0
+  - pylint=3.2.7=py39h06a4308_0
+  - pysocks=1.7.1=py39h06a4308_0
+  - python=3.9.19=h955ad1f_1
+  - readline=8.2=h5eee18b_0
+  - requests=2.32.3=py39h06a4308_0
+  - setuptools=72.1.0=py39h06a4308_0
+  - sqlite=3.45.3=h5eee18b_0
+  - tk=8.6.14=h39e8969_0
+  - tomli=2.0.1=py39h06a4308_0
+  - tomlkit=0.11.1=py39h06a4308_0
+  - tqdm=4.66.5=py39h2f386ee_0
+  - typing-extensions=4.11.0=py39h06a4308_0
+  - typing_extensions=4.11.0=py39h06a4308_0
+  - tzdata=2024a=h04d1e81_0
+  - urllib3=2.2.2=py39h06a4308_0
+  - wheel=0.44.0=py39h06a4308_0
+  - xz=5.4.6=h5eee18b_1
+  - zlib=1.2.13=h5eee18b_1
+prefix: /home/fusselkater/.conda/envs/sync_kiwix
diff --git a/sync_kiwix.py b/sync_kiwix.py
new file mode 100644
index 0000000..ae9baba
--- /dev/null
+++ b/sync_kiwix.py
@@ -0,0 +1,222 @@
+#!/usr/bin/env python3
+
+import sys
+import os
+import datetime
+import argparse
+import requests
+import re
+import json
+import hashlib
+import base64
+from urllib.parse import urljoin
+from tqdm import tqdm
+
+KIWIX_BASE_URL = "https://download.kiwix.org/zim/"
+
+class CacheException(Exception):
+    pass
+class MultipleFileException(Exception):
+    pass
+
+
+def parse_args() -> argparse.Namespace:
+    parser = argparse.ArgumentParser(
+        prog="sync_kiwix",
+        description="Synchronize zim files for kiwix",
+    )
+    parser.add_argument('wiki', type=str, nargs="+", help="Wikis to synchronize")
+    parser.add_argument('destination', type=str, help="Destination directory")
+    parser.add_argument('--cache', action=argparse.BooleanOptionalAction, default=True, help="Use filelist cache (default: yes)")
+    parser.add_argument('--cache-file', type=str, default=os.path.join(os.getcwd(), ".cache"), help="Path to cache file (default: ./.cache)")
+
+    return parser.parse_args()
+
+def load_cached_filelist(cache_file: str) -> list:
+    try:
+        with open(file=cache_file, mode='r') as file:
+            return json.load(file)
+    except json.decoder.JSONDecodeError:
+        raise CacheException('Could not decode JSON')
+
+def save_cached_filelist(cache_file: str, filelist: list):
+    with open(file=cache_file, mode='w') as file:
+        json.dump(filelist, file)
+
+def retreive_filelist(session: requests.Session, url: str, cache_file: str = "", cache_max_age: datetime.timedelta = datetime.timedelta(days=1)) -> list:
+
+    if cache_file != "":
+        try:
+            mtime = datetime.datetime.fromtimestamp(os.path.getmtime(cache_file))
+            if datetime.datetime.now() - mtime < cache_max_age:
+                return load_cached_filelist(cache_file=cache_file)
+        except (FileNotFoundError, CacheException):
+            pass
+
+    response = session.get(url)
+    response.raise_for_status()
+
+    # Pattern assumption: an Apache-style HTML index where subdirectory links
+    # end in "/" and file links end in ".zim" (the original patterns were empty here)
+    directories = re.findall(r'href="(\w[^"]*/)"', response.text)
+    files = re.findall(r'href="([^"]+\.zim)"', response.text)
+
+    result = list()
+
+    for directory in directories:
+        result += retreive_filelist(session=session, url=urljoin(url, directory))
+
+    for file in files:
+        # Each filelist entry is a pair: [directory url, file name]
+        result.append([url, file])
+
+    if cache_file != "":
+        save_cached_filelist(cache_file=cache_file, filelist=result)
+
+    return result
+
+def find_wiki_files(filelist: list, wiki: str) -> list:
+    result = list()
+    for file in filelist:
+        url = urljoin(file[0], file[1])
+        if wiki in url:
+            result.append(file)
+    return result
+
+def error_multiple_files(wiki: str, wiki_files: list):
+    print(f"{wiki} has multiple matches. Please specify your input more precisely.\n", file=sys.stderr)
+    print(f"{wiki} matched to:", file=sys.stderr)
+
+    for file in wiki_files:
+        url = urljoin(file[0], file[1])
+        print(f" - {url}", file=sys.stderr)
+
+    raise SystemExit("Aborting.")
+
+def get_download_candidates(wiki_files: list, wiki: str) -> list:
+    candidates = list()
+
+    # Check if results are unique
+    path0 = None
+    file_base0 = None
+
+    for wiki_file in wiki_files:
+        path = wiki_file[0]
+        file = wiki_file[1]
+
+        if path0 is None:
+            path0 = path
+        if path0 != path:
+            raise MultipleFileException(wiki)
+
+        file_name, file_extension = os.path.splitext(file)
+        file_base, file_date = file_name.rsplit('_', 1)
+
+        if file_base0 is None:
+            file_base0 = file_base
+        if file_base0 != file_base:
+            raise MultipleFileException(wiki)
+
+        # Candidate layout: (path, file, file_name, file_extension, file_base, file_date)
+        candidates.append((
+            path,
+            file,
+            file_name,
+            file_extension,
+            file_base,
+            file_date
+        ))
+
+    return candidates
+
+def get_wiki_files(wikis: list, filelist: list) -> list:
+    download_files = list()
+
+    for wiki in wikis:
+        wiki_files = find_wiki_files(filelist=filelist, wiki=wiki)
+        try:
+            candidates = get_download_candidates(wiki_files=wiki_files, wiki=wiki)
+        except MultipleFileException:
+            error_multiple_files(wiki=wiki, wiki_files=wiki_files)
+
+        # Get most current candidate
+        candidate0 = None
+        for candidate in candidates:
+            if candidate0 is None:
+                candidate0 = candidate
+                continue
+
+            candidate0_date = datetime.datetime.strptime(candidate0[5], "%Y-%m")
+            candidate_date = datetime.datetime.strptime(candidate[5], "%Y-%m")
+
+            if candidate_date > candidate0_date:
+                candidate0 = candidate
+
+        # Download entry layout: (wiki, file_extension, full url, file name)
+        download_files.append((
+            wiki,
+            candidate0[3],
+            urljoin(candidate0[0], candidate0[1]),
+            candidate0[1]
+        ))
+
+    return download_files
+
+def check_is_newer(path: str, last_modified: str) -> bool:
+    try:
+        mtime_local = datetime.datetime.fromtimestamp(os.path.getmtime(path))
+    except FileNotFoundError:
+        return True
+
+    mtime_remote = datetime.datetime.strptime(last_modified, "%a, %d %b %Y %H:%M:%S %Z")
+    return mtime_remote.date() > mtime_local.date()
+
+def download_wiki(session: requests.Session, title: str, src: str, dst: str):
+
+    # Get digest
+    response = session.head(url=src)
+    digests = response.headers.get("Digest").split(",")
+    digest_value = None
+    for digest in digests:
+        method, data = digest.strip().split("=", 1)
+        if method == "SHA-256":
+            digest_value = base64.b64decode(data)
+            break
+
+    sha256 = hashlib.sha256()
+    with session.get(url=src, stream=True) as response:
+        response.raise_for_status()
+
+        if not check_is_newer(dst, response.headers.get("Last-Modified")):
+            print(f"{title} was not updated. Skipping...")
Skipping...") + return + + total_size = int(response.headers.get("content-length", 0)) + block_size = 1024 + with tqdm(total=total_size, unit="B", unit_scale=True, desc=title) as progress_bar: + with open(dst + '.part', "wb") as file: + for block in response.iter_content(block_size): + progress_bar.update(len(block)) + file.write(block) + sha256.update(block) + + if digest_value != sha256.digest(): + raise SystemExit(f"Checksum Error for {title}. Aborting.") + + os.rename(dst + '.part', dst) + +def main(): + args = parse_args() + + # Create Session + session = requests.Session() + + # Get Filelist + filelist = retreive_filelist(session=session, url=KIWIX_BASE_URL, cache_file=args.cache_file if args.cache else "") + + # Get downlaod files list + wiki_files = get_wiki_files(wikis=args.wiki, filelist=filelist) + + # Download files + for wiki_file in wiki_files: + file_path = os.path.join(args.destination, wiki_file[0] + wiki_file[1]) + download_wiki(session=session, title=wiki_file[3], src=wiki_file[2], dst=file_path) + +if __name__ == "__main__": + main() \ No newline at end of file