LP1844418 Elastic indexer isbn/issn cleaners
author Bill Erickson <berickxx@gmail.com>
Wed, 27 Oct 2021 17:01:14 +0000 (13:01 -0400)
committer Bill Erickson <berickxx@gmail.com>
Mon, 13 Jun 2022 20:02:46 +0000 (16:02 -0400)
Signed-off-by: Bill Erickson <berickxx@gmail.com>
Open-ILS/src/perlmods/lib/OpenILS/Elastic/BibSearch.pm

index d412e75..8e80f0a 100644 (file)
@@ -765,6 +765,53 @@ sub index_pubdate {
     append_field_value($body, 'pubdate', $value) if $value;
 }
 
+# TODO: move the clean_* subs to a separate module.
+
+# Returns a list of two array references.  The first array holds
+# validated Business::ISBN objects, which can be rendered as e.g.:
+#
+# $isbn->as_isbn10->isbn # compact
+# $isbn->as_isbn13->as_string # with hyphens
+#
+# The second array holds raw string tokens that did not validate as
+# ISBNs; the only requirements are that they be at least 10 characters
+# long and contain at least one digit.
+sub clean_isbns {
+    my ($value) = @_;
+
+    # @valid collects Business::ISBN objects that pass validation;
+    # @raw collects the digit-bearing tokens that do not.
+    my (@valid, @raw);
+
+    if ($value) {
+        # Break the collected raw value apart on single spaces and ask
+        # Business::ISBN which of the pieces are real ISBNs.
+        for my $part (split(/ /, $value)) {
+
+            # Tokens shorter than 10 characters are ignored entirely.
+            next if length($part) <= 9;
+
+            my $candidate = Business::ISBN->new($part);
+            if ($candidate && $candidate->is_valid) {
+                push(@valid, $candidate);
+                next;
+            }
+
+            # Not a valid ISBN: keep the raw token only if it has a digit.
+            push(@raw, $part) if $part =~ /\d+/;
+        }
+    }
+
+    # Always two array refs, even when there was nothing to parse.
+    return (\@valid, \@raw);
+}
+                                                                               
+# Returns the list of Business::ISSN objects found in the space-separated
+# $value that pass validation.  Returns an empty list when $value is false.
+sub clean_issns {
+    my ($value) = @_;
+    return () unless $value;
+
+    my @valid;
+
+    # Break the collected raw value apart on single spaces and keep
+    # only the pieces Business::ISSN accepts as valid.
+    for my $part (split(/ /, $value)) {
+        my $candidate = Business::ISSN->new($part);
+        next unless $candidate;
+        next unless $candidate->is_valid;
+        push(@valid, $candidate);
+    }
+
+    return @valid;
+}
 
 # Indexes ISBN10, ISBN13, and formatted values of both (with hyphens)
 sub index_isbns {
@@ -772,7 +819,7 @@ sub index_isbns {
     return unless $value;
     
     my %seen; # deduplicate values
-    my @values = OpenILS::Utils::Normalize::clean_isbns($value);
+    my @values = clean_isbns($value);
     my $isbns = $values[0];
     my $strings = $values[1];
 
@@ -799,7 +846,7 @@ sub index_issns {
     return unless $value;
 
     my %seen; # deduplicate values
-    my @issns = OpenILS::Utils::Normalize::clean_issns($value);
+    my @issns = clean_issns($value);
     
     for my $issn (@issns) {
         # no option in business::issn to get the unformatted value.