from aim.services import S
from aim.digifeeds.db_client import DBClient
from aim.digifeeds.item import get_item
from rclone_python import rclone
from datetime import datetime, timedelta, date
import csv
import tempfile
def list_barcodes_potentially_in_hathifiles():
    """Return barcodes that are pending deletion but not yet recorded as in the hathifiles."""
    items = DBClient().get_items(q="status:pending_deletion -status:in_hathifiles")
    if items:
        return [item["barcode"] for item in items]
    return []
def last_two_weeks_rclone_filter(start_date: datetime | None = None):
    """Return an rclone glob matching file names that start with any date in the
    last 14 days, in the form "{YYYY-MM-DD*,YYYY-MM-DD*,...}".
    """
    # Resolve the default at call time; a datetime default in the signature
    # would be evaluated once at import and go stale.
    if start_date is None:
        start_date = datetime.today()
    day_count = 14
    dates = []
    for single_date in (start_date - timedelta(n) for n in range(day_count)):
        formatted_date = single_date.strftime("%Y-%m-%d")
        dates.append(f"{formatted_date}*")
    joined = ",".join(dates)
    return f"{{{joined}}}"
def barcodes_added_in_last_two_weeks():
    """Return [date, barcode] rows for files added to the S3 processed path in the last two weeks."""
    files = rclone.ls(
        path=f"{S.digifeeds_s3_rclone_remote}:{S.digifeeds_s3_processed_path}",
        args=[f'--include "{last_two_weeks_rclone_filter()}"'],
        files_only=True,
        max_depth=2,
    )
    output = []
    for file in files:
        # The date is the first underscore-separated segment of the file name;
        # the barcode is the third, with its file extension stripped.
        barcode = file["Name"].split("_")[2].split(".")[0]
        file_date = file["Name"].split("_")[0]
        S.logger.info(
            "added_to_barcode_report",
            barcode=barcode,
            message="Added to barcode report",
        )
        output.append([filemaker_date(file_date), barcode])
    return output
def write_and_send_report_to_mayhem(
    content,
    base_name,
    rclone_path="",
    report_file=None,
):
    """Write `content` rows to a tab-delimited file and copy it to `rclone_path` as <today>_<base_name>.txt."""
    if not report_file:
        report_file = tempfile.NamedTemporaryFile()
    with open(report_file.name, "w") as rf:
        writer = csv.writer(rf, delimiter="\t", lineterminator="\n")
        S.logger.info("writing_report_rows_to_file")
        writer.writerows(content)
    S.logger.info("writing_delivery_report")
    today = date.today().isoformat()
    rclone.copyto(
        in_path=report_file.name,
        out_path=f"{rclone_path}/{today}_{base_name}.txt",
    )
def generate_barcodes_added_in_last_two_weeks_report():
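    """Build the two-week processed-barcodes report and deliver it to the fileserver."""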
content = barcodes_added_in_last_two_weeks()
write_and_send_report_to_mayhem(
content=content,
base_name="barcodes_in_s3_processed",
rclone_path=f"{S.digifeeds_fileserver_rclone_remote}:{S.digifeeds_delivery_reports_path}",
)
def barcodes_in_hathifiles_in_last_two_weeks():
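    """Return [barcode, date, url] rows for items that entered the hathifiles in the last 14 days."""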
two_weeks_ago = date.today() - timedelta(14)
items = DBClient().get_items(
q=f"status.in_hathifiles.created_at>={two_weeks_ago.isoformat()}"
)
if items:
return [
[
item["barcode"],
filemaker_date(item["hathifiles_timestamp"]),
hathitrust_url(item["barcode"]),
]
for item in items
]
else:
return []
def generate_barcodes_in_hathifiles_report():
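    """Build the two-week hathifiles report and deliver it to the fileserver."""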
content = barcodes_in_hathifiles_in_last_two_weeks()
write_and_send_report_to_mayhem(
content=content,
base_name="barcodes_in_hathifiles",
rclone_path=f"{S.digifeeds_fileserver_rclone_remote}:{S.digifeeds_hathifiles_reports_path}",
)
def prune_processed_barcodes(rclone_path: str, location: str):
    """Remove files and directories for barcodes that have made it into the hathifiles.

    Barcodes not yet in the hathifiles are left in place and logged.
    """
    # Group the top-level files and directories under rclone_path by barcode.
    data_structure = {}
    files_and_directories = rclone.ls(
        path=rclone_path,
        files_only=False,
        max_depth=1,
    )
    for f in files_and_directories:
        barcode = barcode_from_name(f["Name"])
        data_structure.setdefault(barcode, []).append(f)
    for barcode, entries in data_structure.items():
        db_item = get_item(barcode)
        if db_item.has_status("in_hathifiles"):
            for item in entries:
                if item["IsDir"]:
                    # purge removes a directory and everything in it
                    rclone.purge(path=f"{rclone_path}/{item['Path']}")
                else:
                    rclone.delete(path=f"{rclone_path}/{item['Path']}")
            S.logger.info(
                "prune",
                barcode=barcode,
                message="removed because it was found in the hathifiles",
            )
            db_item.add_status(barcode=barcode, status=f"pruned_from_{location}")
        else:
            S.logger.info(
                "not_in_hathifiles",
                barcode=barcode,
                message="not pruned because not found in hathifiles",
            )
def barcode_from_name(name):
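    """Extract the barcode from a file or directory name: the last underscore-separated segment, extension stripped."""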
return name.split(".")[0].split("_")[-1]
def filemaker_date(datestr: str) -> str:
    """Format an ISO 8601 date string as MM/DD/YYYY for the FileMaker reports."""
    parsed = datetime.fromisoformat(datestr)
    return parsed.strftime("%m/%d/%Y")
def hathitrust_url(barcode: str) -> str:
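    """Return the HathiTrust page-turner URL for an mdp-prefixed barcode."""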
return f"https://babel.hathitrust.org/cgi/pt?id=mdp.{barcode}"