sync_kiwix/sync_kiwix.py

262 lines
7.6 KiB
Python
Raw Permalink Normal View History

2024-09-13 20:58:49 +02:00
#!/usr/bin/env python3
import sys
import os
import datetime
import argparse
import requests
import re
import json
import hashlib
import base64
from urllib.parse import urljoin
from tqdm import tqdm
KIWIX_BASE_URL = "https://download.kiwix.org/zim/"
2024-09-14 02:48:29 +02:00
2024-09-13 20:58:49 +02:00
class CacheException(Exception):
    """Raised when the cached zim file list cannot be parsed."""
class NoMatchException(Exception):
    """Raised when a requested wiki matches no zim file.

    The offending wiki name is kept on the instance so callers can
    print "did you mean" suggestions for it.
    """

    def __init__(self, message: str, wiki: str):
        self.wiki = wiki
        super().__init__(message)
class DownloadException(Exception):
    """Raised when an HTTP request fails or a download is invalid."""
def parse_args() -> argparse.Namespace:
    """Define the command line interface and parse sys.argv."""
    p = argparse.ArgumentParser(
        prog="sync_kiwix",
        description="Synchronize zim files for kiwix",
    )

    # Positional arguments: what to fetch and where to put it.
    p.add_argument("wiki", type=str, nargs="+", help="Wikis to synchronize")
    p.add_argument("destination", type=str, help="Destination directory")

    # Cache tuning knobs.
    p.add_argument(
        "--cache",
        action=argparse.BooleanOptionalAction,
        default=True,
        help="Use filelist cache (default: yes)",
    )
    p.add_argument(
        "--cache-file",
        type=str,
        default=os.path.join(os.getcwd(), ".cache"),
        help="Path to cache file (default: ./.cache)",
    )
    p.add_argument(
        "--cache-max-age",
        type=int,
        default=1,
        help="Maximum age of the cached file list in days",
    )

    return p.parse_args()
def load_cache(cache_file: str) -> list:
    """Read the cached zim file list (a JSON array) from *cache_file*.

    Raises CacheException on invalid JSON; a missing file propagates
    FileNotFoundError to the caller.
    """
    with open(file=cache_file, mode="r") as fh:
        try:
            return json.load(fh)
        except json.decoder.JSONDecodeError:
            raise CacheException("Could not decode JSON")
def save_cache(cache_file: str, zim_list: list):
    """Persist *zim_list* as JSON to *cache_file*, overwriting any content."""
    with open(file=cache_file, mode="w") as fh:
        json.dump(zim_list, fh)
def retreive_zim_list(session: requests.Session, url: str) -> list:
    """Recursively scrape the kiwix download index starting at *url*.

    Returns one dict per discovered ``.zim`` file describing its URL and
    the name/date components parsed from the filename.
    Raises DownloadException when an index page cannot be fetched.
    """
    try:
        response = session.get(url)
        response.raise_for_status()
    except requests.exceptions.HTTPError as err:
        raise DownloadException(str(err))

    page = response.text
    subdirs = re.findall(r'<a href="(\w+\/)">', page)
    zim_files = re.findall(r'<a href="([\w\-\.]+\.zim)">', page)

    result = []
    # Depth-first descent into every sub-directory of the index page.
    for subdir in subdirs:
        result += retreive_zim_list(session=session, url=urljoin(url, subdir))

    for zim_file in zim_files:
        base, extension = os.path.splitext(zim_file)
        # Filenames look like "<name>_<YYYY-MM>.zim"; split off the date.
        name, date = base.rsplit("_", 1)
        result.append(
            {
                "url": urljoin(url, zim_file),
                "url_base": url,
                "file": zim_file,
                "file_base": base,
                "file_date": date,
                "file_extension": extension,
                "file_base_without_date": name,
            }
        )

    return result
def print_fuzzy_matches(wiki: str, zim_list: list):
    """Print to stderr every zim entry whose URL contains *wiki*.

    Used as a "did you mean" helper after a NoMatchException: substring
    matching against the full URL is deliberately loose.
    """
    # Plain literal — the original had a pointless f-prefix (no placeholders).
    print("Here is a list of zim files similar to your request:", file=sys.stderr)

    for zim in zim_list:
        if wiki in zim["url"]:
            print(f" - {zim['file_base_without_date']} ({zim['url']})", file=sys.stderr)
def generate_download_list(wikis: list, zim_list: list) -> list:
    """Pick, for every name in *wikis*, the newest matching zim entry.

    A match is an exact equality against ``file_base_without_date``;
    "newest" is decided by the ``YYYY-MM`` ``file_date`` field.
    Raises NoMatchException when a wiki has no match at all.
    """
    result = []
    for wiki in wikis:
        best = None
        for zim in zim_list:
            if zim["file_base_without_date"] != wiki:
                continue
            if best is None:
                best = zim
                continue
            # Both dates are "YYYY-MM"; keep whichever is more recent.
            best_date = datetime.datetime.strptime(best["file_date"], "%Y-%m")
            zim_date = datetime.datetime.strptime(zim["file_date"], "%Y-%m")
            if zim_date > best_date:
                best = zim

        if best is None:
            raise NoMatchException(f"Could not find any matches for {wiki}", wiki=wiki)

        result.append(best)

    return result
def check_is_newer(path: str, last_modified: str) -> bool:
    """Return True when the remote file should be downloaded.

    Compares the remote ``Last-Modified`` header value against the mtime
    of the local file at *path*, at day granularity.  A missing local
    file or an absent header (``None``) both mean "download it".
    """
    try:
        mtime_local = datetime.datetime.fromtimestamp(os.path.getmtime(path))
    except FileNotFoundError:
        # No local copy yet: always download.
        return True

    # Servers may omit Last-Modified; previously this crashed in strptime.
    # Err on the side of downloading.
    if last_modified is None:
        return True

    mtime_remote = datetime.datetime.strptime(last_modified, "%a, %d %b %Y %H:%M:%S %Z")
    # Day granularity is enough: kiwix publishes monthly snapshots.
    return mtime_remote.date() > mtime_local.date()
def download_zim(session: requests.Session, zim: dict, dst: str):
    """Download *zim* to *dst*, verifying the server's SHA-256 digest.

    The download is skipped when the local copy is at least as new as the
    remote one.  Data is streamed into ``dst + ".part"`` and renamed into
    place only after the checksum matches; a ``.sha256sum`` companion
    file is written alongside.  Raises DownloadException on HTTP errors,
    a missing digest, or a checksum mismatch.
    """
    # HEAD first: the Digest header carries the expected checksum.
    try:
        response = session.head(url=zim["url"])
        response.raise_for_status()
    except requests.exceptions.HTTPError as err:
        raise DownloadException(str(err))

    # Header format: 'Digest: SHA-256=<base64>[,...]'.  The header may be
    # absent entirely — .get() previously returned None and crashed on
    # .split(); defaulting to "" falls through to the error below instead.
    digest_value = None
    for digest in response.headers.get("Digest", "").split(","):
        method, sep, data = digest.strip().partition("=")
        if sep and method == "SHA-256":
            digest_value = base64.b64decode(data)
            break

    if digest_value is None:
        raise DownloadException(
            f"Could not get SHA-256 digest for {zim['file']}. Aborting."
        )

    sha256 = hashlib.sha256()
    try:
        with session.get(url=zim["url"], stream=True) as response:
            response.raise_for_status()
            if not check_is_newer(dst, response.headers.get("Last-Modified")):
                print(
                    f"{zim['file_base_without_date']} was not updated on server. Skipping..."
                )
                return

            total_size = int(response.headers.get("content-length", 0))
            block_size = 1024
            # Stream into the ".part" file while hashing, with a progress bar.
            with tqdm(
                total=total_size,
                unit="B",
                unit_scale=True,
                desc=zim["file_base_without_date"],
            ) as progress_bar:
                with open(dst + ".part", "wb") as file:
                    for block in response.iter_content(block_size):
                        progress_bar.update(len(block))
                        file.write(block)
                        sha256.update(block)

    except requests.exceptions.HTTPError as err:
        raise DownloadException(str(err))

    if digest_value != sha256.digest():
        raise DownloadException(
            f"Checksum Error for {zim['file_base_without_date']}. Aborting."
        )

    # Record the checksum in "sha256sum" tool format for offline verification.
    sha256_hex = sha256.hexdigest()
    with open(dst + ".sha256sum", "w") as file:
        file.write(f"{sha256_hex} {dst}\n")

    # Publish the final filename only after a verified, complete download.
    os.rename(dst + ".part", dst)
def main():
    """Entry point: resolve the zim list, then download each requested wiki."""
    args = parse_args()
    # One session for all requests (connection reuse).
    session = requests.Session()

    # Read ZIM list from cache if enabled and still fresh enough.
    zim_list = None
    if args.cache:  # idiom fix: was `args.cache == True`
        try:
            mtime = datetime.datetime.fromtimestamp(os.path.getmtime(args.cache_file))
            if datetime.datetime.now() - mtime < datetime.timedelta(
                days=args.cache_max_age
            ):
                zim_list = load_cache(cache_file=args.cache_file)
        except (FileNotFoundError, CacheException):
            # Missing or corrupt cache: fall through to a fresh scrape.
            pass

    # Retrieve a new list, if no cached file was loaded.
    if zim_list is None:
        try:
            zim_list = retreive_zim_list(session=session, url=KIWIX_BASE_URL)
        except DownloadException as err:
            raise SystemExit(str(err))

        save_cache(cache_file=args.cache_file, zim_list=zim_list)

    # Resolve each requested wiki name to its newest zim entry.
    try:
        download_list = generate_download_list(wikis=args.wiki, zim_list=zim_list)
    except NoMatchException as err:
        print(f"{str(err)}\n", file=sys.stderr)
        print_fuzzy_matches(wiki=err.wiki, zim_list=zim_list)
        raise SystemExit("\nAborting.")

    # Download zim files; destination names drop the date suffix so each
    # wiki always lives at a stable path.
    try:
        for zim in download_list:
            dst_path = os.path.join(
                args.destination, zim["file_base_without_date"] + zim["file_extension"]
            )
            download_zim(session=session, zim=zim, dst=dst_path)
    except DownloadException as err:
        raise SystemExit(str(err))


if __name__ == "__main__":
    main()