From 5749fb4db1f0f7c81ee1a0d3a55ceef36775e495 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan-Marten=20Br=C3=BCggemann?= Date: Sat, 14 Sep 2024 01:17:30 +0200 Subject: [PATCH] fix matching --- sync_kiwix.py | 74 ++++++++++++++++++--------------------------------- 1 file changed, 26 insertions(+), 48 deletions(-) diff --git a/sync_kiwix.py b/sync_kiwix.py index 989ef3d..282baeb 100644 --- a/sync_kiwix.py +++ b/sync_kiwix.py @@ -60,47 +60,18 @@ def retreive_filelist(session: requests.Session, url: str) -> list: return result -def find_wiki_files(filelist: list, wiki: str) -> list: - result = list() - for file in filelist: - if file[1].startswith(wiki): - result.append(file) - return result - -def error_multiple_files(wiki: str, wiki_files: str): - print(f"{wiki} has multiple matches. Please specify your input more precisely.\n", file=sys.stderr) - print(f"{wiki} matched to:", file=sys.stderr) - - for file in wiki_files: - url = urljoin(file[0], file[1]) - print(f" - {url}", file=sys.stderr) - - raise SystemExit("Aborting.") - -def get_download_candidates(wiki_files: list, wiki: str) -> dict: +def get_download_candidates(filelist: list, wiki: str) -> dict: candidates = list() - # Check if results are unique - path0 = None - file_base0 = None - - for wiki_file in wiki_files: - path = wiki_file[0] - file = wiki_file[1] - - if path0 is None: - path0 = path - if path0 != path: - raise MultipleFileException(wiki) - + for file_item in filelist: + path = file_item[0] + file = file_item[1] file_name, file_extension = os.path.splitext(file) file_base, file_date = file_name.rsplit('_', 1) - if file_base0 is None: - file_base0 = file_base - if file_base0 != file_base: - raise MultipleFileException(wiki) + if wiki != file_base: + continue candidates.append(( path, @@ -113,15 +84,25 @@ def get_download_candidates(wiki_files: list, wiki: str) -> dict: return candidates -def get_wiki_files(wikis: list, filelist: list) -> list: +def error_no_candidate(filelist: list, wiki: str): + print(f"Could not find any match to {wiki}.\n", file=sys.stderr) + print(f"Here is a list of urls similar to your request:", file=sys.stderr) + + for file_item in filelist: + url = urljoin(file_item[0], file_item[1]) + if wiki in url: + print(f" - {url}", file=sys.stderr) + + raise SystemExit("Aborting.") + +def get_download_files(wikis: list, filelist: list) -> list: download_files = list() for wiki in wikis: - wiki_files = find_wiki_files(filelist=filelist, wiki=wiki) - try: - candidates = get_download_candidates(wiki_files=wiki_files, wiki=wiki) - except MultipleFileException: - error_multiple_files(wiki=wiki, wiki_files=wiki_files) + candidates = get_download_candidates(filelist=filelist, wiki=wiki) + + if not candidates: + error_no_candidate(wiki=wiki, filelist=filelist) # Get most current candidate candidate0 = None @@ -136,9 +117,6 @@ def get_wiki_files(wikis: list, filelist: list) -> list: if candidate_date > candidate0_date: candidate0 = candidate - if candidate0 is None: - raise SystemExit(f"Could not find any download candidate for {wiki}. Aborting.") - download_files.append(( wiki, candidate0[3], @@ -217,12 +195,12 @@ def main(): save_cached_filelist(cache_file=args.cache_file, filelist=filelist) # Get downlaod files list - wiki_files = get_wiki_files(wikis=args.wiki, filelist=filelist) + download_files = get_download_files(wikis=args.wiki, filelist=filelist) # Download files - for wiki_file in wiki_files: - file_path = os.path.join(args.destination, wiki_file[0] + wiki_file[1]) - download_wiki(session=session, title=wiki_file[3], src=wiki_file[2], dst=file_path) + for download_file in download_files: + file_path = os.path.join(args.destination, download_file[0] + download_file[1]) + download_wiki(session=session, title=download_file[3], src=download_file[2], dst=file_path) if __name__ == "__main__": main() \ No newline at end of file