fix regex for files

This commit is contained in:
Jan-Marten Brüggemann 2024-09-14 00:40:17 +02:00
parent d153e41715
commit 3b368d1a81

View file

@ -43,21 +43,12 @@ def save_cached_filelist(cache_file: str, filelist: list):
with open(file=cache_file, mode='w') as file: with open(file=cache_file, mode='w') as file:
json.dump(filelist, file) json.dump(filelist, file)
def retreive_filelist(session: requests.Session, url: str, cache_file: str = "", cache_max_age: datetime.timedelta = datetime.timedelta(days=1)) -> list: def retreive_filelist(session: requests.Session, url: str) -> list:
if cache_file != "":
try:
mtime = datetime.datetime.fromtimestamp(os.path.getmtime(cache_file))
if datetime.datetime.now() - mtime < cache_max_age:
return load_cached_filelist(cache_file=cache_file)
except (FileNotFoundError, CacheException):
pass
response = session.get(url) response = session.get(url)
response.raise_for_status() response.raise_for_status()
directories = re.findall(r'<a href="(\w+\/)">', response.text) directories = re.findall(r'<a href="(\w+\/)">', response.text)
files = re.findall(r'<a href="([\w-]+\.zim)">', response.text) files = re.findall(r'<a href="([\w\-\.]+\.zim)">', response.text)
result = list() result = list()
@ -67,9 +58,6 @@ def retreive_filelist(session: requests.Session, url: str, cache_file: str = "",
for file in files: for file in files:
result.append([url, file]) result.append([url, file])
if cache_file != "":
save_cached_filelist(cache_file=cache_file, filelist=result)
return result return result
def find_wiki_files(filelist: list, wiki: str) -> list: def find_wiki_files(filelist: list, wiki: str) -> list:
@ -104,7 +92,7 @@ def get_download_candidates(wiki_files: list, wiki: str) -> dict:
path0 = path path0 = path
if path0 != path: if path0 != path:
raise MultipleFileException(wiki) raise MultipleFileException(wiki)
file_name, file_extension = os.path.splitext(file) file_name, file_extension = os.path.splitext(file)
file_base, file_date = file_name.rsplit('_', 1) file_base, file_date = file_name.rsplit('_', 1)
@ -148,6 +136,9 @@ def get_wiki_files(wikis: list, filelist: list) -> list:
if candidate_date > candidate0_date: if candidate_date > candidate0_date:
candidate0 = candidate candidate0 = candidate
if candidate0 is None:
raise SystemExit(f"Could not find any download candidate for {wiki}. Aborting.")
download_files.append(( download_files.append((
wiki, wiki,
candidate0[3], candidate0[3],
@ -211,7 +202,19 @@ def main():
session = requests.Session() session = requests.Session()
# Get Filelist # Get Filelist
filelist = retreive_filelist(session=session, url=KIWIX_BASE_URL, cache_file=args.cache_file if args.cache else "") filelist = None
if args.cache == True:
try:
mtime = datetime.datetime.fromtimestamp(os.path.getmtime(args.cache_file))
if datetime.datetime.now() - mtime < datetime.timedelta(days=1):
filelist = load_cached_filelist(cache_file=args.cache_file)
except (FileNotFoundError, CacheException):
pass
if filelist is None:
filelist = retreive_filelist(session=session, url=KIWIX_BASE_URL)
save_cached_filelist(cache_file=args.cache_file, filelist=filelist)
# Get downlaod files list # Get downlaod files list
wiki_files = get_wiki_files(wikis=args.wiki, filelist=filelist) wiki_files = get_wiki_files(wikis=args.wiki, filelist=filelist)