Add a date cutoff option for Scholars Portal books
author Dan Scott <dan@coffeecode.net>
Tue, 7 Apr 2020 17:03:12 +0000 (13:03 -0400)
committer Dan Scott <dan@coffeecode.net>
Tue, 7 Apr 2020 17:03:12 +0000 (13:03 -0400)
Signed-off-by: Dan Scott <dan@coffeecode.net>
tools/ebooks/prep_ebook_records.py

index 6a823cb..a31df9e 100755 (executable)
@@ -24,7 +24,7 @@ import json
 import codecs
 import copy
 import requests
-from datetime import date
+import datetime
 from bs4 import BeautifulSoup
 import traceback
 
@@ -139,6 +139,9 @@ Optional arguments:
 
     -d / --duplicate : The name of the file to route ISBN duplicates to.
 
+    -D / --date : The ISO formatted date before which records should not be
+                  processed (specific to ScholarsPortal-sourced records)
+
     -F / --from-format : The format ('xml' or 'marc21') of the input file
 
     -T / --to-format : The format ('xml' or 'marc21') of the output file
@@ -178,6 +181,7 @@ def consolidate_options(opts):
         "-c": "--consortium",
         "-C": "--clean",
         "-d": "--duplicate",
+        "-D": "--date",
         "-e": "--ebrary",
         "-F": "--from-format",
         "-I": "--isbn-sfx",
@@ -255,6 +259,7 @@ def check_options(options):
     _string_opts = {
         "--authorization": "authorization",
         "--consortium": "consortium",
+        "--date": "date",
         "--duplicate": "duplicate",
         "--from-format": "from-format",
         "--note": "note",
@@ -281,6 +286,7 @@ def check_options(options):
     clean_opts["input"] = _input
     clean_opts["output"] = _output
     clean_opts["settings"] = Institution()
+    clean_opts["date"] = datetime.date.fromisoformat(clean_opts["date"])
 
     return clean_opts
 
@@ -411,7 +417,7 @@ def check_libraries(options):
 def parse_opts():
     """Get command-line arguments from the script"""
     try:
-        _short_opts = "i:o:a:p:P:ABLc:eCId:F:T:t:u:n:s:x:h"
+        _short_opts = "i:o:a:p:P:ABLc:eCIdD:F:T:t:u:n:s:x:h"
         _long_opts = [
             "input=",
             "output=",
@@ -426,6 +432,7 @@ def parse_opts():
             "clean",
             "isbn-sfx",
             "duplicate=",
+            "date=",
             "from-format=",
             "to-format=",
             "tcn=",
@@ -505,7 +512,22 @@ def process_record(record, options, files):
             else:
                 del dupe_flags["url"]
 
-        if dupe_flags:
+        date_cutoff = False
+
+        if options["date"]:
+            u = record["856"]["u"]
+            date_match = re.search(r"([\d]{4})-([\d]{2})-([\d]{2})", u)
+
+            cutoff = options["date"]
+            d = datetime.date(
+                int(date_match.group(1)),
+                int(date_match.group(2)),
+                int(date_match.group(3)),
+            )
+            if d <= cutoff:
+                date_cutoff = True
+
+        if dupe_flags or date_cutoff:
             DUP_COUNT += 1
         else:
             new_record = process_fields(record, options)
@@ -1113,7 +1135,14 @@ def add_marc_source(record, options):
     marc_source = pymarc.Field(
         tag="598",
         indicators=[" ", " "],
-        subfields=["a", source, "b", date.today().isoformat(), "c", str(RECORD_COUNT)],
+        subfields=[
+            "a",
+            source,
+            "b",
+            datetime.date.today().isoformat(),
+            "c",
+            str(RECORD_COUNT),
+        ],
     )
     record.add_ordered_field(marc_source)