From: Bill Erickson
Date: Wed, 27 Oct 2021 17:01:14 +0000 (-0400)
Subject: LP1844418 Elastic indexer isbn/issn cleaners
X-Git-Url: https://old-git.evergreen-ils.org/?a=commitdiff_plain;h=61c6523cc7849be92a8d2f6460e2f13d2b35ec84;p=working%2FEvergreen.git

LP1844418 Elastic indexer isbn/issn cleaners

Signed-off-by: Bill Erickson
---

diff --git a/Open-ILS/src/perlmods/lib/OpenILS/Elastic/BibSearch.pm b/Open-ILS/src/perlmods/lib/OpenILS/Elastic/BibSearch.pm
index d412e75ef8..8e80f0abfa 100644
--- a/Open-ILS/src/perlmods/lib/OpenILS/Elastic/BibSearch.pm
+++ b/Open-ILS/src/perlmods/lib/OpenILS/Elastic/BibSearch.pm
@@ -765,6 +765,53 @@ sub index_pubdate {
     append_field_value($body, 'pubdate', $value) if $value;
 }
 
+# TODO: move the clean_* subs to a separate module.
+
+# Returns 2 arrays of ISBNs. The first list are validated Business::ISBN
+# objects:
+#
+# $isbn->as_isbn10->isbn # compact
+# $isbn->as_isbn13->as_string # with hyphens
+#
+# The second list are raw string values whose only limiting factor is
+# they be at least 10 characters long and contain numbers.
+sub clean_isbns {
+    my $value = shift;
+    my @isbns;
+    my @strings;
+
+    return (\@isbns, \@strings) unless $value;
+
+    # Chop up the collected raw values into parts and let
+    # Business::* tell us which parts looks like ISBNs.
+    for my $token (split(/ /, $value)) {
+        if (length($token) > 9) {
+            my $isbn = Business::ISBN->new($token);
+            if ($isbn && $isbn->is_valid) {
+                push(@isbns, $isbn);
+            } elsif ($token =~ /\d+/) {
+                push(@strings, $token);
+            }
+        }
+    }
+
+    return (\@isbns, \@strings);
+}
+
+sub clean_issns {
+    my $value = shift;
+    return () unless $value;
+    my @issns;
+
+    # Chop up the collected raw values into parts and let
+    # Business::* tell us which parts looks valid.
+    for my $token (split(/ /, $value)) {
+        my $issn = Business::ISSN->new($token);
+        push(@issns, $issn) if $issn && $issn->is_valid;
+    }
+
+    return @issns;
+}
 
 # Indexes ISBN10, ISBN13, and formatted values of both (with hyphens)
 sub index_isbns {
@@ -772,7 +819,7 @@ sub index_isbns {
     return unless $value;
 
     my %seen; # deduplicate values
-    my @values = OpenILS::Utils::Normalize::clean_isbns($value);
+    my @values = clean_isbns($value);
     my $isbns = $values[0];
     my $strings = $values[1];
 
@@ -799,7 +846,7 @@ sub index_issns {
     return unless $value;
 
     my %seen; # deduplicate values
-    my @issns = OpenILS::Utils::Normalize::clean_issns($value);
+    my @issns = clean_issns($value);
 
     for my $issn (@issns) {
         # no option in business::issn to get the unformatted value.