From e4594d161b2dedda7160922fae92dc635009ad93 Mon Sep 17 00:00:00 2001 From: dbs Date: Sun, 30 Jan 2011 04:22:08 +0000 Subject: [PATCH] Correct authority browsing for reals First, restore the "<=" comparison ($page <= 0) that enables both the before and after ranges in authority_tag_sf_browse(), after I mistakenly removed it in r19131; the second storage request for $after does not stomp on the prior $before results, it simply gets pushed onto the carefully constructed list of $before results, ensuring that our target is in the middle of page 0. Second, we're treating all of the "tag" members in the method registration as list references now (for the purpose of searching against 4xx/5xx in the .refs. variants), but that was blowing up when we registered just a single tag as a string and tried to treat the scalar as a list reference. I could have checked to see if what we had incoming was a reference and danced accordingly, but opted to just define all single-tag entries as single-element arrays instead. Applied the same to startwith. Finally, in r19331 I had used chop() to ensure that an exact match for startwith would be returned as element 1 on page 0, instead of appearing as the last element of page -1. I had said that the right way to do this would be to naco_normalize() the value to match the normalized afr.value, and so this is what I have done. Rather than torturously using O:A:Storage:FTS to get at the naco_normalize() definition, I moved the function into its own Utils package and adjusted its usage accordingly throughout the affected code. One step closer to single-sourcing the function in the database, as well? 
git-svn-id: svn://svn.open-ils.org/ILS/trunk@19332 dcc99617-32d9-48b4-a31d-7c20da2025e4 --- .../perlmods/OpenILS/Application/Storage/FTS.pm | 67 +-------------------- .../Application/Storage/Publisher/authority.pm | 3 +- .../src/perlmods/OpenILS/Application/SuperCat.pm | 36 ++++++----- Open-ILS/src/perlmods/OpenILS/Utils/Normalize.pm | 70 ++++++++++++++++++++++ Open-ILS/tests/naco_normalize.t | 10 ++-- Open-ILS/xul/staff_client/server/cat/marcedit.js | 3 +- 6 files changed, 99 insertions(+), 90 deletions(-) create mode 100644 Open-ILS/src/perlmods/OpenILS/Utils/Normalize.pm diff --git a/Open-ILS/src/perlmods/OpenILS/Application/Storage/FTS.pm b/Open-ILS/src/perlmods/OpenILS/Application/Storage/FTS.pm index 321333c5b8..05c4decc8c 100644 --- a/Open-ILS/src/perlmods/OpenILS/Application/Storage/FTS.pm +++ b/Open-ILS/src/perlmods/OpenILS/Application/Storage/FTS.pm @@ -5,8 +5,7 @@ my $log = 'OpenSRF::Utils::Logger'; package OpenILS::Application::Storage::FTS; use OpenSRF::Utils::Logger qw/:level/; use Parse::RecDescent; -use Unicode::Normalize; -use Encode; +use OpenILS::Utils::Normalize qw( naco_normalize ); my $_default_grammar_parser = new Parse::RecDescent ( <<'GRAMMAR' ); @@ -28,70 +27,6 @@ numeric_range: /\d+-\d*/ GRAMMAR -# FIXME - this is a copy-and-paste of the naco_normalize -# stored procedure -sub naco_normalize { - - my $str = decode_utf8(shift); - my $sf = shift; - - # Apply NACO normalization to input string; based on - # http://www.loc.gov/catdir/pcc/naco/SCA_PccNormalization_Final_revised.pdf - # - # Note that unlike a strict reading of the NACO normalization rules, - # output is returned as lowercase instead of uppercase for compatibility - # with previous versions of the Evergreen naco_normalize routine. - - # Convert to upper-case first; even though final output will be lowercase, doing this will - # ensure that the German eszett (ß) and certain ligatures (ff, fi, ffl, etc.) will be handled correctly. 
- # If there are any bugs in Perl's implementation of upcasing, they will be passed through here. - $str = uc $str; - - # remove non-filing strings - $str =~ s/\x{0098}.*?\x{009C}//g; - - $str = NFKD($str); - - # additional substitutions - 3.6. - $str =~ s/\x{00C6}/AE/g; - $str =~ s/\x{00DE}/TH/g; - $str =~ s/\x{0152}/OE/g; - $str =~ tr/\x{0110}\x{00D0}\x{00D8}\x{0141}\x{2113}\x{02BB}\x{02BC}]['/DDOLl/d; - - # transformations based on Unicode category codes - $str =~ s/[\p{Cc}\p{Cf}\p{Co}\p{Cs}\p{Lm}\p{Mc}\p{Me}\p{Mn}]//g; - - if ($sf && $sf =~ /^a/o) { - my $commapos = index($str, ','); - if ($commapos > -1) { - if ($commapos != length($str) - 1) { - $str =~ s/,/\x07/; # preserve first comma - } - } - } - - # since we've stripped out the control characters, we can now - # use a few as placeholders temporarily - $str =~ tr/+&@\x{266D}\x{266F}#/\x01\x02\x03\x04\x05\x06/; - $str =~ s/[\p{Pc}\p{Pd}\p{Pe}\p{Pf}\p{Pi}\p{Po}\p{Ps}\p{Sk}\p{Sm}\p{So}\p{Zl}\p{Zp}\p{Zs}]/ /g; - $str =~ tr/\x01\x02\x03\x04\x05\x06\x07/+&@\x{266D}\x{266F}#,/; - - # decimal digits - $str =~ tr/\x{0660}-\x{0669}\x{06F0}-\x{06F9}\x{07C0}-\x{07C9}\x{0966}-\x{096F}\x{09E6}-\x{09EF}\x{0A66}-\x{0A6F}\x{0AE6}-\x{0AEF}\x{0B66}-\x{0B6F}\x{0BE6}-\x{0BEF}\x{0C66}-\x{0C6F}\x{0CE6}-\x{0CEF}\x{0D66}-\x{0D6F}\x{0E50}-\x{0E59}\x{0ED0}-\x{0ED9}\x{0F20}-\x{0F29}\x{1040}-\x{1049}\x{1090}-\x{1099}\x{17E0}-\x{17E9}\x{1810}-\x{1819}\x{1946}-\x{194F}\x{19D0}-\x{19D9}\x{1A80}-\x{1A89}\x{1A90}-\x{1A99}\x{1B50}-\x{1B59}\x{1BB0}-\x{1BB9}\x{1C40}-\x{1C49}\x{1C50}-\x{1C59}\x{A620}-\x{A629}\x{A8D0}-\x{A8D9}\x{A900}-\x{A909}\x{A9D0}-\x{A9D9}\x{AA50}-\x{AA59}\x{ABF0}-\x{ABF9}\x{FF10}-\x{FF19}/0-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-9/; - - # intentionally skipping step 8 of the NACO algorithm; if the string - # gets normalized away, that's fine. 
- - # leading and trailing spaces - $str =~ s/\s+/ /g; - $str =~ s/^\s+//; - $str =~ s/\s+$//g; - - return lc $str; -} - -#' stupid vim syntax highlighting ... - sub compile { $log->debug("You must override me somewhere, or I will make searching really slow!!!!",ERROR);; diff --git a/Open-ILS/src/perlmods/OpenILS/Application/Storage/Publisher/authority.pm b/Open-ILS/src/perlmods/OpenILS/Application/Storage/Publisher/authority.pm index 62091ce7e0..f21530f745 100644 --- a/Open-ILS/src/perlmods/OpenILS/Application/Storage/Publisher/authority.pm +++ b/Open-ILS/src/perlmods/OpenILS/Application/Storage/Publisher/authority.pm @@ -4,6 +4,7 @@ use vars qw/$VERSION/; use OpenSRF::EX qw/:try/; use OpenILS::Application::Storage::FTS; use OpenILS::Utils::Fieldmapper; +use OpenILS::Utils::Normalize qw( naco_normalize ); use OpenSRF::Utils::Logger qw/:level/; use OpenSRF::Utils::Cache; use Data::Dumper; @@ -33,7 +34,7 @@ sub validate_tag { for my $t ( @tags ) { for my $search ( @searches ) { my $sf = $$search{subfield}; - my $term = OpenILS::Application::Storage::FTS::naco_normalize($$search{term}, $sf); + my $term = naco_normalize($$search{term}, $sf); $tag = [$tag] if (!ref($tag)); diff --git a/Open-ILS/src/perlmods/OpenILS/Application/SuperCat.pm b/Open-ILS/src/perlmods/OpenILS/Application/SuperCat.pm index 3a85a22c2b..eed53b1bc7 100644 --- a/Open-ILS/src/perlmods/OpenILS/Application/SuperCat.pm +++ b/Open-ILS/src/perlmods/OpenILS/Application/SuperCat.pm @@ -14,6 +14,7 @@ package OpenILS::Application::SuperCat; use strict; use warnings; +use OpenILS::Utils::Normalize qw( naco_normalize ); # All OpenSRF applications must be based on OpenSRF::Application or # a subclass thereof. Makes sense, eh? 
@@ -863,7 +864,7 @@ sub general_authority_browse { __PACKAGE__->register_method( method => 'general_authority_browse', api_name => 'open-ils.supercat.authority.title.browse', - tag => '130', subfield => 'a', + tag => ['130'], subfield => 'a', api_level => 1, argc => 1, signature => @@ -908,7 +909,7 @@ __PACKAGE__->register_method( __PACKAGE__->register_method( method => 'general_authority_browse', api_name => 'open-ils.supercat.authority.topic.browse', - tag => '150', subfield => 'a', + tag => ['150'], subfield => 'a', api_level => 1, argc => 1, signature => @@ -923,7 +924,7 @@ __PACKAGE__->register_method( __PACKAGE__->register_method( method => 'general_authority_browse', api_name => 'open-ils.supercat.authority.title.refs.browse', - tag => '130', subfield => 'a', + tag => ['130'], subfield => 'a', api_level => 1, argc => 1, signature => @@ -968,7 +969,7 @@ __PACKAGE__->register_method( __PACKAGE__->register_method( method => 'general_authority_browse', api_name => 'open-ils.supercat.authority.topic.refs.browse', - tag => '150', subfield => 'a', + tag => ['150'], subfield => 'a', api_level => 1, argc => 1, signature => @@ -991,6 +992,9 @@ sub authority_tag_sf_browse { my $page_size = shift || 9; my $page = shift || 0; + # Match authority.full_rec normalization + $value = naco_normalize($value, $subfield); + my ($before_limit,$after_limit) = (0,0); my ($before_offset,$after_offset) = (0,0); @@ -1016,13 +1020,13 @@ sub authority_tag_sf_browse { } my @list = (); - if ($page < 0) { + if ($page <= 0) { my $before = $_storage->request( "open-ils.cstore.json_query.atomic", { select => { afr => [qw/record value/] }, from => { 'are', 'afr' }, where => { - '+afr' => { tag => \@ref_tags, subfield => $subfield, value => { '<' => lc($value) } }, + '+afr' => { tag => \@ref_tags, subfield => $subfield, value => { '<' => $value } }, '+are' => { 'deleted' => 'f' } }, order_by => { afr => { value => 'desc' } }, @@ -1039,7 +1043,7 @@ sub authority_tag_sf_browse { { select => { afr 
=> [qw/record value/] }, from => { 'are', 'afr' }, where => { - '+afr' => { tag => \@ref_tags, subfield => $subfield, value => { '>=' => lc($value) } }, + '+afr' => { tag => \@ref_tags, subfield => $subfield, value => { '>=' => $value } }, '+are' => { 'deleted' => 'f' } }, order_by => { afr => { value => 'asc' } }, @@ -1374,7 +1378,7 @@ sub general_authority_startwith { __PACKAGE__->register_method( method => 'general_authority_startwith', api_name => 'open-ils.supercat.authority.title.startwith', - tag => '130', subfield => 'a', + tag => ['130'], subfield => 'a', api_level => 1, argc => 1, signature => @@ -1419,7 +1423,7 @@ __PACKAGE__->register_method( __PACKAGE__->register_method( method => 'general_authority_startwith', api_name => 'open-ils.supercat.authority.topic.startwith', - tag => '150', subfield => 'a', + tag => ['150'], subfield => 'a', api_level => 1, argc => 1, signature => @@ -1434,7 +1438,7 @@ __PACKAGE__->register_method( __PACKAGE__->register_method( method => 'general_authority_startwith', api_name => 'open-ils.supercat.authority.title.refs.startwith', - tag => '130', subfield => 'a', + tag => ['130'], subfield => 'a', api_level => 1, argc => 1, signature => @@ -1479,7 +1483,7 @@ __PACKAGE__->register_method( __PACKAGE__->register_method( method => 'general_authority_startwith', api_name => 'open-ils.supercat.authority.topic.refs.startwith', - tag => '150', subfield => 'a', + tag => ['150'], subfield => 'a', api_level => 1, argc => 1, signature => @@ -1503,8 +1507,8 @@ sub authority_tag_sf_startwith { my $limit = shift || 10; my $page = shift || 0; - # Chop the last character from the incoming value to return it on page 0 - chop($value); + # Match authority.full_rec normalization + $value = naco_normalize($value, $subfield); my $ref_limit = $limit; my $offset = $limit * abs($page); @@ -1531,7 +1535,7 @@ sub authority_tag_sf_startwith { { select => { afr => [qw/record value/] }, from => { 'afr', 'are' }, where => { - '+afr' => { tag => \@ref_tags, 
subfield => $subfield, value => { '<' => lc($value) } }, + '+afr' => { tag => \@ref_tags, subfield => $subfield, value => { '<' => $value } }, '+are' => { deleted => 'f' } }, order_by => { afr => { value => 'desc' } }, @@ -1548,7 +1552,7 @@ sub authority_tag_sf_startwith { { select => { afr => [qw/record value/] }, from => { 'afr', 'are' }, where => { - '+afr' => { tag => \@ref_tags, subfield => $subfield, value => { '>=' => lc($value) } }, + '+afr' => { tag => \@ref_tags, subfield => $subfield, value => { '>=' => $value } }, '+are' => { deleted => 'f' } }, order_by => { afr => { value => 'asc' } }, @@ -3391,4 +3395,4 @@ sub as_xml { 1; -# vim: noet:ts=4:sw=4 +# vim: et:ts=4:sw=4 diff --git a/Open-ILS/src/perlmods/OpenILS/Utils/Normalize.pm b/Open-ILS/src/perlmods/OpenILS/Utils/Normalize.pm new file mode 100644 index 0000000000..d71503c5e1 --- /dev/null +++ b/Open-ILS/src/perlmods/OpenILS/Utils/Normalize.pm @@ -0,0 +1,70 @@ +package OpenILS::Utils::Normalize; +use strict; +use warnings; +use Unicode::Normalize; +use Encode; + +use Exporter 'import'; +our @EXPORT_OK = qw( naco_normalize ); + +sub naco_normalize { + + my $str = decode_utf8(shift); + my $sf = shift; + + # Apply NACO normalization to input string; based on + # http://www.loc.gov/catdir/pcc/naco/SCA_PccNormalization_Final_revised.pdf + # + # Note that unlike a strict reading of the NACO normalization rules, + # output is returned as lowercase instead of uppercase for compatibility + # with previous versions of the Evergreen naco_normalize routine. + + # Convert to upper-case first; even though final output will be lowercase, doing this will + # ensure that the German eszett (ß) and certain ligatures (ff, fi, ffl, etc.) will be handled correctly. + # If there are any bugs in Perl's implementation of upcasing, they will be passed through here. + $str = uc $str; + + # remove non-filing strings + $str =~ s/\x{0098}.*?\x{009C}//g; + + $str = NFKD($str); + + # additional substitutions - 3.6. 
+ $str =~ s/\x{00C6}/AE/g; + $str =~ s/\x{00DE}/TH/g; + $str =~ s/\x{0152}/OE/g; + $str =~ tr/\x{0110}\x{00D0}\x{00D8}\x{0141}\x{2113}\x{02BB}\x{02BC}]['/DDOLl/d; + + # transformations based on Unicode category codes + $str =~ s/[\p{Cc}\p{Cf}\p{Co}\p{Cs}\p{Lm}\p{Mc}\p{Me}\p{Mn}]//g; + + if ($sf && $sf =~ /^a/o) { + my $commapos = index($str, ','); + if ($commapos > -1) { + if ($commapos != length($str) - 1) { + $str =~ s/,/\x07/; # preserve first comma + } + } + } + + # since we've stripped out the control characters, we can now + # use a few as placeholders temporarily + $str =~ tr/+&@\x{266D}\x{266F}#/\x01\x02\x03\x04\x05\x06/; + $str =~ s/[\p{Pc}\p{Pd}\p{Pe}\p{Pf}\p{Pi}\p{Po}\p{Ps}\p{Sk}\p{Sm}\p{So}\p{Zl}\p{Zp}\p{Zs}]/ /g; + $str =~ tr/\x01\x02\x03\x04\x05\x06\x07/+&@\x{266D}\x{266F}#,/; + + # decimal digits + $str =~ tr/\x{0660}-\x{0669}\x{06F0}-\x{06F9}\x{07C0}-\x{07C9}\x{0966}-\x{096F}\x{09E6}-\x{09EF}\x{0A66}-\x{0A6F}\x{0AE6}-\x{0AEF}\x{0B66}-\x{0B6F}\x{0BE6}-\x{0BEF}\x{0C66}-\x{0C6F}\x{0CE6}-\x{0CEF}\x{0D66}-\x{0D6F}\x{0E50}-\x{0E59}\x{0ED0}-\x{0ED9}\x{0F20}-\x{0F29}\x{1040}-\x{1049}\x{1090}-\x{1099}\x{17E0}-\x{17E9}\x{1810}-\x{1819}\x{1946}-\x{194F}\x{19D0}-\x{19D9}\x{1A80}-\x{1A89}\x{1A90}-\x{1A99}\x{1B50}-\x{1B59}\x{1BB0}-\x{1BB9}\x{1C40}-\x{1C49}\x{1C50}-\x{1C59}\x{A620}-\x{A629}\x{A8D0}-\x{A8D9}\x{A900}-\x{A909}\x{A9D0}-\x{A9D9}\x{AA50}-\x{AA59}\x{ABF0}-\x{ABF9}\x{FF10}-\x{FF19}/0-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-9/; + + # intentionally skipping step 8 of the NACO algorithm; if the string + # gets normalized away, that's fine. 
+ + # leading and trailing spaces + $str =~ s/\s+/ /g; + $str =~ s/^\s+//; + $str =~ s/\s+$//g; + + return lc $str; +} + +1; diff --git a/Open-ILS/tests/naco_normalize.t b/Open-ILS/tests/naco_normalize.t index 34dbe03bdf..182ebab67d 100644 --- a/Open-ILS/tests/naco_normalize.t +++ b/Open-ILS/tests/naco_normalize.t @@ -6,7 +6,7 @@ use Test::More tests => 50; use Unicode::Normalize; use DBI; -use OpenILS::Application::Storage::FTS; +use OpenILS::Utils::Normalize qw( naco_normalize ); # This could be made better in at least one of two ways (or both); # 1. put PL/Perl code that doesn't require a database into external @@ -57,13 +57,13 @@ my @test_cases = ( [ '♭©®♯', '♭ ♯', 'other symbols' ], ); -# test copy of naco_normalize in OpenILS::Application::Storage::FTS +# test copy of naco_normalize in OpenILS::Utils::Normalize foreach my $case (@test_cases) { - is(OpenILS::Application::Storage::FTS::naco_normalize($case->[0]), $case->[1], $case->[2] . ' (FTS.pm)'); + is(naco_normalize($case->[0]), $case->[1], $case->[2] . ' (Normalize.pm)'); } -is(OpenILS::Application::Storage::FTS::naco_normalize('Smith, Jane. Poet, painter, and author', 'a'), +is(naco_normalize('Smith, Jane. 
Poet, painter, and author', 'a'), 'smith, jane poet painter and author', - 'retain first comma (FTS.pm)'); + 'retain first comma (Normalize.pm)'); SKIP: { my $dbh = DBI->connect($dsn, $db_user, $db_pw, {AutoCommit => 1, pg_enable_utf8 => 1, PrintError => 0}); diff --git a/Open-ILS/xul/staff_client/server/cat/marcedit.js b/Open-ILS/xul/staff_client/server/cat/marcedit.js index c231e9a344..2dfd226367 100644 --- a/Open-ILS/xul/staff_client/server/cat/marcedit.js +++ b/Open-ILS/xul/staff_client/server/cat/marcedit.js @@ -2375,8 +2375,7 @@ function browseAuthority (sf_popup, menu_id, target, sf, limit, page) { page = 0; } - /* temporarily use 'startwith' instead of 'browse' until browse is repaired */ - var url = '/opac/extras/startwith/marcxml/' + var url = '/opac/extras/browse/marcxml/' + type + '.refs' + '/1' // OU - currently unscoped + '/' + sf.toString() -- 2.11.0