refactoring
This commit is contained in:
parent
43f6ef9429
commit
12be2f9c45
1 changed files with 111 additions and 102 deletions
213
sync_kiwix.py
213
sync_kiwix.py
|
@ -17,6 +17,14 @@ KIWIX_BASE_URL = "https://download.kiwix.org/zim/"
|
||||||
class CacheException(Exception):
    """Raised when the ZIM list cache file cannot be decoded."""
|
||||||
|
|
||||||
|
class NoMatchException(Exception):
    """Raised when a requested wiki name matches no ZIM file.

    Attributes:
        wiki: The requested wiki name for which nothing was found.
    """

    def __init__(self, message: str, wiki: str):
        # The message goes to Exception so that str(err) returns it.
        super().__init__(message)
        self.wiki = wiki
|
||||||
|
|
||||||
|
class DownloadException(Exception):
    """Raised when a server request fails or a download cannot be verified."""
|
||||||
|
|
||||||
def parse_args() -> argparse.Namespace:
|
def parse_args() -> argparse.Namespace:
|
||||||
parser = argparse.ArgumentParser(
|
parser = argparse.ArgumentParser(
|
||||||
prog="sync_kiwix",
|
prog="sync_kiwix",
|
||||||
|
@ -26,102 +34,82 @@ def parse_args() -> argparse.Namespace:
|
||||||
parser.add_argument('destination', type=str, help="Destination directory")
|
parser.add_argument('destination', type=str, help="Destination directory")
|
||||||
parser.add_argument('--cache', action=argparse.BooleanOptionalAction, default=True, help="Use filelist cache (default: yes)")
|
parser.add_argument('--cache', action=argparse.BooleanOptionalAction, default=True, help="Use filelist cache (default: yes)")
|
||||||
parser.add_argument('--cache-file', type=str, default=os.path.join(os.getcwd(), ".cache"), help="Path to cache file (default: ./.cache)")
|
parser.add_argument('--cache-file', type=str, default=os.path.join(os.getcwd(), ".cache"), help="Path to cache file (default: ./.cache)")
|
||||||
|
parser.add_argument('--cache-max-age', type=int, default=1, help="Maximum age of the cached file list in days")
|
||||||
|
|
||||||
return parser.parse_args()
|
return parser.parse_args()
|
||||||
|
|
||||||
def load_cache(cache_file: str) -> list:
    """Load the cached ZIM list from a JSON file.

    Args:
        cache_file: Path of the JSON cache file.

    Returns:
        The cached list of ZIM entries, one dict per file.

    Raises:
        FileNotFoundError: If ``cache_file`` does not exist.
        CacheException: If the file cannot be decoded as JSON, or if it does
            not hold a list of dicts (e.g. a cache written in the older
            ``[url, file]`` pair format).
    """
    try:
        with open(file=cache_file, mode='r', encoding='utf-8') as file:
            zim_list = json.load(file)
    except (json.decoder.JSONDecodeError, UnicodeDecodeError) as err:
        raise CacheException('Could not decode JSON') from err

    # Reject stale or foreign cache layouts here so the caller re-fetches the
    # list instead of failing later on ``zim["url"]``.
    if not isinstance(zim_list, list) or not all(isinstance(zim, dict) for zim in zim_list):
        raise CacheException('Unexpected cache format')

    return zim_list
|
||||||
|
|
||||||
def save_cache(cache_file: str, zim_list: list):
    """Write the ZIM list to the cache file as JSON.

    Args:
        cache_file: Path of the cache file; it is created or overwritten.
        zim_list: JSON-serialisable list of ZIM entries to store.
    """
    with open(cache_file, 'w') as handle:
        json.dump(zim_list, handle)
|
||||||
|
|
||||||
def retreive_zim_list(session: requests.Session, url: str) -> list:
    """Recursively collect every ZIM file linked below a directory index.

    The page at ``url`` is scanned for sub-directory links, which are
    followed recursively, and for ``*.zim`` links, which are turned into
    entry dicts.

    Args:
        session: Session used to fetch the index pages.
        url: URL of the directory index to scan.

    Returns:
        A list of dicts with the keys ``url``, ``url_base``, ``file``,
        ``file_base``, ``file_date``, ``file_extension`` and
        ``file_base_without_date``. Sub-directory results come first.

    Raises:
        DownloadException: If an index page cannot be fetched.
    """
    try:
        response = session.get(url)
        response.raise_for_status()
    except requests.exceptions.RequestException as err:
        # RequestException also covers connection errors and timeouts,
        # not only HTTP error statuses.
        raise DownloadException(str(err)) from err

    directories = re.findall(r'<a href="(\w+\/)">', response.text)
    files = re.findall(r'<a href="([\w\-\.]+\.zim)">', response.text)

    result = []

    for directory in directories:
        result += retreive_zim_list(session=session, url=urljoin(url, directory))

    for file in files:
        file_base, file_extension = os.path.splitext(file)
        file_base_without_date, separator, file_date = file_base.rpartition('_')
        if not separator:
            # No "_<date>" suffix to split off; skip the file rather than
            # abort the whole scan.
            continue

        result.append({
            "url": urljoin(url, file),
            "url_base": url,
            "file": file,
            "file_base": file_base,
            "file_date": file_date,
            "file_extension": file_extension,
            "file_base_without_date": file_base_without_date,
        })

    return result
|
||||||
|
|
||||||
def print_fuzzy_matches(wiki: str, zim_list: list):
    """Print ZIM entries whose URL contains the requested name to stderr.

    Args:
        wiki: The requested wiki name; used as a plain substring.
        zim_list: ZIM entry dicts with ``url`` and
            ``file_base_without_date`` keys.
    """
    print("Here is a list of zim files similar to your request:", file=sys.stderr)

    similar = (entry for entry in zim_list if wiki in entry["url"])
    for entry in similar:
        name = entry['file_base_without_date']
        link = entry['url']
        print(f" - {name} ({link})", file=sys.stderr)
|
def _zim_release_date(zim: dict) -> datetime.datetime:
    """Return the ``YYYY-MM`` date of a ZIM entry, or ``datetime.min`` if unparsable."""
    try:
        return datetime.datetime.strptime(zim["file_date"], "%Y-%m")
    except ValueError:
        # A suffix that is not a date sorts as oldest instead of aborting.
        return datetime.datetime.min


def generate_download_list(wikis: list, zim_list: list) -> list:
    """Pick the most recent ZIM entry for each requested wiki.

    Args:
        wikis: Requested names; each must equal an entry's
            ``file_base_without_date`` exactly.
        zim_list: ZIM entry dicts with ``file_base_without_date`` and
            ``file_date`` keys.

    Returns:
        One entry per requested wiki, in the order of ``wikis``. Among
        entries with the same date the first one in ``zim_list`` wins.

    Raises:
        NoMatchException: If a requested wiki matches no entry.
    """
    result = []

    for wiki in wikis:
        candidates = [zim for zim in zim_list if zim["file_base_without_date"] == wiki]
        if not candidates:
            raise NoMatchException(f"Could not find any matches for {wiki}", wiki=wiki)

        # max() returns the first of several equally recent entries.
        result.append(max(candidates, key=_zim_release_date))

    return result
|
|
||||||
|
|
||||||
def check_is_newer(path: str, last_modified: str) -> bool:
|
def check_is_newer(path: str, last_modified: str) -> bool:
|
||||||
try:
|
try:
|
||||||
|
@ -132,43 +120,52 @@ def check_is_newer(path: str, last_modified: str) -> bool:
|
||||||
mtime_remote = datetime.datetime.strptime(last_modified, "%a, %d %b %Y %H:%M:%S %Z")
|
mtime_remote = datetime.datetime.strptime(last_modified, "%a, %d %b %Y %H:%M:%S %Z")
|
||||||
return mtime_remote.date() > mtime_local.date()
|
return mtime_remote.date() > mtime_local.date()
|
||||||
|
|
||||||
def download_zim(session: requests.Session, zim: dict, dst: str):
    """Download one ZIM file and verify it against the server's SHA-256 digest.

    The data is streamed to ``dst + '.part'``. After the checksum matches,
    ``dst + '.sha256sum'`` is written and the part file is renamed to
    ``dst``. Nothing is downloaded when ``check_is_newer`` reports that the
    server copy is not newer than ``dst``.

    Args:
        session: Session used for the HEAD and GET requests.
        zim: ZIM entry dict; the keys ``url``, ``file`` and
            ``file_base_without_date`` are read.
        dst: Final path of the downloaded file.

    Raises:
        DownloadException: If a request fails, the server sends no SHA-256
            digest, or the downloaded data does not match the digest.
    """
    # Get sha256 digest from server
    try:
        response = session.head(url=zim["url"])
        response.raise_for_status()
    except requests.exceptions.RequestException as err:
        # Also covers connection errors and timeouts, not only HTTP statuses.
        raise DownloadException(str(err)) from err

    digest_value = None
    # A missing header yields one empty entry, which matches nothing below.
    for digest in response.headers.get("Digest", "").split(","):
        # Split on the first "=" only: the base64 payload may end in "=".
        method, _, data = digest.strip().partition("=")
        if method.upper() == "SHA-256":
            digest_value = base64.b64decode(data)
            break

    if digest_value is None:
        raise DownloadException(f"Could not get SHA-256 digest for {zim['file']}. Aborting.")

    sha256 = hashlib.sha256()
    part_path = dst + '.part'

    try:
        with session.get(url=zim["url"], stream=True) as response:
            response.raise_for_status()

            # Without a Last-Modified header there is nothing to compare,
            # so the file is downloaded.
            last_modified = response.headers.get("Last-Modified")
            if last_modified is not None and not check_is_newer(dst, last_modified):
                print(f"{zim['file_base_without_date']} was not updated on server. Skipping...")
                return

            total_size = int(response.headers.get("content-length", 0))
            block_size = 1024
            with tqdm(total=total_size, unit="B", unit_scale=True, desc=zim['file_base_without_date']) as progress_bar:
                with open(part_path, "wb") as file:
                    for block in response.iter_content(block_size):
                        progress_bar.update(len(block))
                        file.write(block)
                        sha256.update(block)
    except requests.exceptions.RequestException as err:
        raise DownloadException(str(err)) from err

    if digest_value != sha256.digest():
        # Do not leave a corrupt partial download behind.
        os.remove(part_path)
        raise DownloadException(f"Checksum Error for {zim['file_base_without_date']}. Aborting.")

    sha256_hex = sha256.hexdigest()
    with open(dst + '.sha256sum', 'w') as file:
        file.write(f"{sha256_hex} {dst}\n")

    # Rename last, so dst only ever holds a fully verified file.
    os.rename(part_path, dst)
|
||||||
|
|
||||||
|
@ -178,28 +175,40 @@ def main():
|
||||||
# Create Session
|
# Create Session
|
||||||
session = requests.Session()
|
session = requests.Session()
|
||||||
|
|
||||||
# Get Filelist
|
# Read ZIM list from cache if enabled
|
||||||
filelist = None
|
zim_list = None
|
||||||
|
|
||||||
if args.cache == True:
|
if args.cache == True:
|
||||||
try:
|
try:
|
||||||
mtime = datetime.datetime.fromtimestamp(os.path.getmtime(args.cache_file))
|
mtime = datetime.datetime.fromtimestamp(os.path.getmtime(args.cache_file))
|
||||||
if datetime.datetime.now() - mtime < datetime.timedelta(days=1):
|
if datetime.datetime.now() - mtime < datetime.timedelta(days=args.cache_max_age):
|
||||||
filelist = load_cached_filelist(cache_file=args.cache_file)
|
zim_list = load_cache(cache_file=args.cache_file)
|
||||||
except (FileNotFoundError, CacheException):
|
except (FileNotFoundError, CacheException):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
if filelist is None:
|
# Retreive new list, if no cached file was loaded
|
||||||
filelist = retreive_filelist(session=session, url=KIWIX_BASE_URL)
|
if zim_list is None:
|
||||||
save_cached_filelist(cache_file=args.cache_file, filelist=filelist)
|
try:
|
||||||
|
zim_list = retreive_zim_list(session=session, url=KIWIX_BASE_URL)
|
||||||
|
except DownloadException as err:
|
||||||
|
raise SystemExit(str(err))
|
||||||
|
|
||||||
|
save_cache(cache_file=args.cache_file, zim_list=zim_list)
|
||||||
|
|
||||||
# Get download files list
|
# Get download list
|
||||||
download_files = get_download_files(wikis=args.wiki, filelist=filelist)
|
try:
|
||||||
|
download_list = generate_download_list(wikis=args.wiki, zim_list=zim_list)
|
||||||
|
except NoMatchException as err:
|
||||||
|
print(f"{str(err)}\n", file=sys.stderr)
|
||||||
|
print_fuzzy_matches(wiki=err.wiki, zim_list=zim_list)
|
||||||
|
raise SystemExit("\nAborting.")
|
||||||
|
|
||||||
# Download files
|
# Download zim files
|
||||||
for download_file in download_files:
|
try:
|
||||||
file_path = os.path.join(args.destination, download_file[0] + download_file[1])
|
for zim in download_list:
|
||||||
download_wiki(session=session, title=download_file[3], src=download_file[2], dst=file_path)
|
dst_path = os.path.join(args.destination, zim["file_base_without_date"] + zim["file_extension"])
|
||||||
|
download_zim(session=session, zim=zim, dst=dst_path)
|
||||||
|
except DownloadException as err:
|
||||||
|
raise SystemExit(str(err))
|
||||||
|
|
||||||
# Run the sync only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
Loading…
Reference in a new issue