Add a date cutoff option for Scholars Portal books

author Dan Scott <dan@coffeecode.net>
Tue, 7 Apr 2020 17:03:12 +0000 (13:03 -0400)

committer Dan Scott <dan@coffeecode.net>
Tue, 7 Apr 2020 17:03:12 +0000 (13:03 -0400)

diff --git a/tools/ebooks/prep_ebook_records.py b/tools/ebooks/prep_ebook_records.py

index 6a823cb..a31df9e 100755 (executable)
--- a/tools/ebooks/prep_ebook_records.py
+++ b/tools/ebooks/prep_ebook_records.py
@@ -24,7 +24,7 @@ import json
  import codecs
  import copy
  import requests
-from datetime import date
+import datetime
  from bs4 import BeautifulSoup
  import traceback
  
@@ -139,6 +139,9 @@ Optional arguments:
  
      -d / --duplicate : The name of the file to route ISBN duplicates to.
  
+    -D / --date : The ISO formatted date before which records should not be
+                  processed (specific to ScholarsPortal-sourced records)
+
      -F / --from-format : The format ('xml' or 'marc21') of the input file
  
      -T / --to-format : The format ('xml' or 'marc21') of the output file
@@ -178,6 +181,7 @@ def consolidate_options(opts):
          "-c": "--consortium",
          "-C": "--clean",
          "-d": "--duplicate",
+        "-D": "--date",
          "-e": "--ebrary",
          "-F": "--from-format",
          "-I": "--isbn-sfx",
@@ -255,6 +259,7 @@ def check_options(options):
      _string_opts = {
          "--authorization": "authorization",
          "--consortium": "consortium",
+        "--date": "date",
          "--duplicate": "duplicate",
          "--from-format": "from-format",
          "--note": "note",
@@ -281,6 +286,7 @@ def check_options(options):
      clean_opts["input"] = _input
      clean_opts["output"] = _output
      clean_opts["settings"] = Institution()
+    clean_opts["date"] = datetime.date.fromisoformat(clean_opts["date"])
  
      return clean_opts
  
@@ -411,7 +417,7 @@ def check_libraries(options):
  def parse_opts():
      """Get command-line arguments from the script"""
      try:
-        _short_opts = "i:o:a:p:P:ABLc:eCId:F:T:t:u:n:s:x:h"
+        _short_opts = "i:o:a:p:P:ABLc:eCIdD:F:T:t:u:n:s:x:h"
          _long_opts = [
              "input=",
              "output=",
@@ -426,6 +432,7 @@ def parse_opts():
              "clean",
              "isbn-sfx",
              "duplicate=",
+            "date=",
              "from-format=",
              "to-format=",
              "tcn=",
@@ -505,7 +512,22 @@ def process_record(record, options, files):
              else:
                  del dupe_flags["url"]
  
-        if dupe_flags:
+        date_cutoff = False
+
+        if options["date"]:
+            u = record["856"]["u"]
+            date_match = re.search(r"([\d]{4})-([\d]{2})-([\d]{2})", u)
+
+            cutoff = options["date"]
+            d = datetime.date(
+                int(date_match.group(1)),
+                int(date_match.group(2)),
+                int(date_match.group(3)),
+            )
+            if d <= cutoff:
+                date_cutoff = True
+
+        if dupe_flags or date_cutoff:
              DUP_COUNT += 1
          else:
              new_record = process_fields(record, options)
@@ -1113,7 +1135,14 @@ def add_marc_source(record, options):
      marc_source = pymarc.Field(
          tag="598",
          indicators=[" ", " "],
-        subfields=["a", source, "b", date.today().isoformat(), "c", str(RECORD_COUNT)],
+        subfields=[
+            "a",
+            source,
+            "b",
+            datetime.date.today().isoformat(),
+            "c",
+            str(RECORD_COUNT),
+        ],
      )
      record.add_ordered_field(marc_source)
author	Dan Scott <dan@coffeecode.net>
	Tue, 7 Apr 2020 17:03:12 +0000 (13:03 -0400)
committer	Dan Scott <dan@coffeecode.net>
	Tue, 7 Apr 2020 17:03:12 +0000 (13:03 -0400)