From: Chris Sharp Date: Mon, 17 Sep 2012 22:39:55 +0000 (-0400) Subject: more changes X-Git-Url: https://old-git.evergreen-ils.org/?a=commitdiff_plain;h=1f806df4be0b610c8162b1e2844081de079c53ff;p=evergreen%2Fpines.git more changes --- diff --git a/Open-ILS/src/sql/Pg/version-upgrade/pines-2.1-2.2-upgrade-pt-2.sql b/Open-ILS/src/sql/Pg/version-upgrade/pines-2.1-2.2-upgrade-pt-2.sql index 2d4bd31cb1..96660f1025 100644 --- a/Open-ILS/src/sql/Pg/version-upgrade/pines-2.1-2.2-upgrade-pt-2.sql +++ b/Open-ILS/src/sql/Pg/version-upgrade/pines-2.1-2.2-upgrade-pt-2.sql @@ -11921,3689 +11921,6 @@ UPDATE authority.record_entry SET control_set = NULL WHERE id IN (SELECT record COMMIT; - -\qecho ************************************************************************ -\qecho The following transaction, wrapping upgrade 0672, may take a while. If -\qecho it takes an unduly long time, try it outside of a transaction. -\qecho ************************************************************************ - -BEGIN; - --- Evergreen DB patch 0672.fix-nonfiling-titles.sql --- --- Titles that begin with non-filing articles using apostrophes --- (for example, "L'armée") get spaces injected between the article --- and the subsequent text, which then breaks searching for titles --- beginning with those articles. --- --- This patch adds a nonfiling title element to MODS32 that can then --- be used to retrieve the title proper without affecting the spaces --- in the title. It's what we want, what we really really want, for --- title searches. --- - - --- check whether patch can be applied -SELECT evergreen.upgrade_deps_block_check('0672', :eg_version); - --- Update the XPath definition before the titleNonfiling element exists; --- but are you really going to read through the whole XSL below before --- seeing this important bit? -UPDATE config.metabib_field - SET xpath = $$//mods32:mods/mods32:titleNonfiling[mods32:title and not (@type)]$$, - format = 'mods32' - WHERE field_class = 'title' AND name = 'proper'; - -UPDATE config.xml_transform SET xslt=$$ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - BK - SE - - - BK - MM - CF - MP - VM - MU - - - - - - - - - b - afgk - - - - - abfgk - - - - - - - - - - - - - - - - - - <xsl:value-of select="substring($titleChop,@ind2+1)"/> - - - - - <xsl:value-of select="$titleChop"/> - - - - - - - - - b - b - afgk - - - - - - - - - - - - - - b - afgk - - - - - abfgk - - - - - - <xsl:value-of select="$title"/> - - - - - - - b - b - afgk - - - - - - - - - - - - <xsl:call-template name="chopPunctuation"> - <xsl:with-param name="chopString"> - <xsl:call-template name="subfieldSelect"> - <xsl:with-param name="codes">a</xsl:with-param> - </xsl:call-template> - </xsl:with-param> - </xsl:call-template> - - - - - - - - - - - - - - <xsl:call-template name="chopPunctuation"> - <xsl:with-param name="chopString"> - <xsl:call-template name="subfieldSelect"> - <!-- 1/04 removed $h, b --> - <xsl:with-param name="codes">a</xsl:with-param> - </xsl:call-template> - </xsl:with-param> - </xsl:call-template> - - - - - - - - - - - - - - - <xsl:call-template name="chopPunctuation"> - <xsl:with-param name="chopString"> - <xsl:call-template name="subfieldSelect"> - <!-- 1/04 removed $h, $b --> - <xsl:with-param name="codes">af</xsl:with-param> - </xsl:call-template> - </xsl:with-param> - </xsl:call-template> - - - - - - - - - <xsl:variable name="str"> - <xsl:for-each select="marc:subfield"> - <xsl:if test="(contains('adfklmor',@code) and (not(../marc:subfield[@code='n' or @code='p']) or (following-sibling::marc:subfield[@code='n' or @code='p'])))"> - <xsl:value-of select="text()"/> - <xsl:text> </xsl:text> - </xsl:if> - </xsl:for-each> - </xsl:variable> - <xsl:call-template name="chopPunctuation"> - <xsl:with-param name="chopString"> - <xsl:value-of select="substring($str,1,string-length($str)-1)"/> - </xsl:with-param> - </xsl:call-template> - - - - - - - - <xsl:call-template name="chopPunctuation"> - <xsl:with-param name="chopString"> - <xsl:call-template name="subfieldSelect"> - <xsl:with-param name="codes">ah</xsl:with-param> - </xsl:call-template> - </xsl:with-param> - </xsl:call-template> - - - - - - - - - - creator - - - - - - - - - creator - - - - - - - - - creator - - - - - - - - - - - - - - - - - - - - - - - - - - - - personal - - - - - - - - - - - yes - - - yes - - - text - cartographic - notated music - sound recording-nonmusical - sound recording-musical - still image - moving image - three dimensional object - software, multimedia - mixed material - - - - globe - - - remote sensing image - - - - - - map - - - atlas - - - - - - - - database - - - loose-leaf - - - series - - - newspaper - - - periodical - - - web site - - - - - - - - abstract or summary - - - bibliography - - - catalog - - - dictionary - - - encyclopedia - - - handbook - - - legal article - - - index - - - discography - - - legislation - - - theses - - - survey of literature - - - review - - - programmed text - - - filmography - - - directory - - - statistics - - - technical report - - - legal case and case notes - - - law report or digest - - - treaty - - - - - - conference publication - - - - - - - - numeric data - - - database - - - font - - - game - - - - - - patent - - - festschrift - - - - biography - - - - - essay - - - drama - - - comic strip - - - fiction - - - humor, satire - - - letter - - - novel - - - short story - - - speech - - - - - - - biography - - - conference publication - - - drama - - - essay - - - fiction - - - folktale - - - history - - - humor, satire - - - memoir - - - poetry - - - rehearsal - - - reporting - - - sound - - - speech - - - - - - - art original - - - kit - - - art reproduction - - - diorama - - - filmstrip - - - legal article - - - picture - - - graphic - - - technical drawing - - - motion picture - - - chart - - - flash card - - - microscope slide - - - model - - - realia - - - slide - - - transparency - - - videorecording - - - toy - - - - - - - - - - abvxyz - - - - - - - - - - - code - marccountry - - - - - - - - code - iso3166 - - - - - - - - text - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - :,;/ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - monographic - continuing - - - - - - - ab - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - reformatted digital - - - digitized microfilm - - - digitized other analog - - - - - - - - - - - - - - - -
braille
-
- -
print
-
- -
electronic
-
- -
microfiche
-
- -
microfilm
-
-
- - -
- - - - - -
-
- -
- - - - - -
-
- -
- - - - - -
-
- -
- - - - - -
-
- -
- - - - - -
-
- -
- - - - - -
-
- -
- -
-
- - - - access - - - preservation - - - replacement - - - - - -
chip cartridge
-
- -
computer optical disc cartridge
-
- -
magnetic disc
-
- -
magneto-optical disc
-
- -
optical disc
-
- -
remote
-
- -
tape cartridge
-
- -
tape cassette
-
- -
tape reel
-
- - -
celestial globe
-
- -
earth moon globe
-
- -
planetary or lunar globe
-
- -
terrestrial globe
-
- - -
kit
-
- - -
atlas
-
- -
diagram
-
- -
map
-
- -
model
-
- -
profile
-
- -
remote-sensing image
-
- -
section
-
- -
view
-
- - -
aperture card
-
- -
microfiche
-
- -
microfiche cassette
-
- -
microfilm cartridge
-
- -
microfilm cassette
-
- -
microfilm reel
-
- -
microopaque
-
- - -
film cartridge
-
- -
film cassette
-
- -
film reel
-
- - -
chart
-
- -
collage
-
- -
drawing
-
- -
flash card
-
- -
painting
-
- -
photomechanical print
-
- -
photonegative
-
- -
photoprint
-
- -
picture
-
- -
print
-
- -
technical drawing
-
- - -
notated music
-
- - -
filmslip
-
- -
filmstrip cartridge
-
- -
filmstrip roll
-
- -
other filmstrip type
-
- -
slide
-
- -
transparency
-
- -
remote-sensing image
-
- -
cylinder
-
- -
roll
-
- -
sound cartridge
-
- -
sound cassette
-
- -
sound disc
-
- -
sound-tape reel
-
- -
sound-track film
-
- -
wire recording
-
- - -
braille
-
- -
combination
-
- -
moon
-
- -
tactile, with no writing system
-
- - -
braille
-
- -
large print
-
- -
regular print
-
- -
text in looseleaf binder
-
- - -
videocartridge
-
- -
videocassette
-
- -
videodisc
-
- -
videoreel
-
- - - - - - - - - - abce - - - -
- - - - - - - - - - ab - - - - - - - - agrt - - - - - - - ab - - - - - - - - - adolescent - - - adult - - - general - - - juvenile - - - preschool - - - specialized - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - defg - - - - - - - - - - - - marcgac - - - - - - iso3166 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ab - - - - - - - abx - - - - - - - ab - - - - - - - - - - - - - - - - - - - - - - - - - - - - ab - - - - - - - - <xsl:call-template name="chopPunctuation"> - <xsl:with-param name="chopString"> - <xsl:call-template name="subfieldSelect"> - <xsl:with-param name="codes">av</xsl:with-param> - </xsl:call-template> - </xsl:with-param> - </xsl:call-template> - - - - - - - - - - <xsl:call-template name="chopPunctuation"> - <xsl:with-param name="chopString"> - <xsl:call-template name="subfieldSelect"> - <xsl:with-param name="codes">av</xsl:with-param> - </xsl:call-template> - </xsl:with-param> - </xsl:call-template> - - - - - - - - - - abcx3 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - <xsl:call-template name="chopPunctuation"> - <xsl:with-param name="chopString"> - <xsl:call-template name="specialSubfieldSelect"> - <xsl:with-param name="anyCodes">tfklmorsv</xsl:with-param> - <xsl:with-param name="axis">t</xsl:with-param> - <xsl:with-param name="afterCodes">g</xsl:with-param> - </xsl:call-template> - </xsl:with-param> - </xsl:call-template> - - - - - - - aq - t - g - - - - - - - - - - - - - - - - <xsl:call-template name="chopPunctuation"> - <xsl:with-param name="chopString"> - <xsl:call-template name="specialSubfieldSelect"> - <xsl:with-param name="anyCodes">tfklmorsv</xsl:with-param> - <xsl:with-param name="axis">t</xsl:with-param> - <xsl:with-param name="afterCodes">dg</xsl:with-param> - </xsl:call-template> - </xsl:with-param> - </xsl:call-template> - - - - - - - - - - - - - - - - - c - t - dgn - - - - - - - - - - - - - - - - - - - <xsl:call-template name="chopPunctuation"> - <xsl:with-param name="chopString"> - <xsl:call-template name="specialSubfieldSelect"> - <xsl:with-param name="anyCodes">tfklsv</xsl:with-param> - <xsl:with-param name="axis">t</xsl:with-param> - <xsl:with-param name="afterCodes">g</xsl:with-param> - </xsl:call-template> - </xsl:with-param> - </xsl:call-template> - - - - - - - aqdc - t - gn - - - - - - - - - - - - - <xsl:call-template name="chopPunctuation"> - <xsl:with-param name="chopString"> - <xsl:call-template name="subfieldSelect"> - <xsl:with-param name="codes">adfgklmorsv</xsl:with-param> - </xsl:call-template> - </xsl:with-param> - </xsl:call-template> - - - - - - - - - - - - - <xsl:call-template name="chopPunctuation"> - <xsl:with-param name="chopString"> - <xsl:value-of select="marc:subfield[@code='a']"></xsl:value-of> - </xsl:with-param> - </xsl:call-template> - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - <xsl:call-template name="chopPunctuation"> - <xsl:with-param name="chopString"> - <xsl:call-template name="specialSubfieldSelect"> - <xsl:with-param name="anyCodes">tfklmorsv</xsl:with-param> - <xsl:with-param name="axis">t</xsl:with-param> - <xsl:with-param name="afterCodes">g</xsl:with-param> - </xsl:call-template> - </xsl:with-param> - </xsl:call-template> - - - - - - - - - aq - t - g - - - - - - - - - - - - - - - - <xsl:call-template name="chopPunctuation"> - <xsl:with-param name="chopString"> - <xsl:call-template name="specialSubfieldSelect"> - <xsl:with-param name="anyCodes">tfklmorsv</xsl:with-param> - <xsl:with-param name="axis">t</xsl:with-param> - <xsl:with-param name="afterCodes">dg</xsl:with-param> - </xsl:call-template> - </xsl:with-param> - </xsl:call-template> - - - - - - - - - - - - - - - - - c - t - dgn - - - - - - - - - - - - <xsl:call-template name="chopPunctuation"> - <xsl:with-param name="chopString"> - <xsl:call-template name="specialSubfieldSelect"> - <xsl:with-param name="anyCodes">tfklsv</xsl:with-param> - <xsl:with-param name="axis">t</xsl:with-param> - <xsl:with-param name="afterCodes">g</xsl:with-param> - </xsl:call-template> - </xsl:with-param> - </xsl:call-template> - - - - - - - aqdc - t - gn - - - - - - - - - - - - <xsl:call-template name="chopPunctuation"> - <xsl:with-param name="chopString"> - <xsl:call-template name="subfieldSelect"> - <xsl:with-param name="codes">adfgklmorsv</xsl:with-param> - </xsl:call-template> - </xsl:with-param> - </xsl:call-template> - - - - - - - - - - - - - - - - isbn - - - - - - - - - - isrc - - - - - - - - - - ismn - - - - - - - - - - sici - - - - ab - - - - - - issn - - - - - - - - lccn - - - - - - - - - - issue number - matrix number - music plate - music publisher - videorecording identifier - - - - - - - ba - ab - - - - - - - - - - ab - - - - - - - - doi - hdl - uri - - - - - - - - - - - - - - - - - y3z - - - - - - - - - - - - - - - - - - - - - y3 - - - - - - - z - - - - - - - - - - - - - - - - - - abje - - - - - - - - abcd35 - - - - - - - abcde35 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - n - n - fgkdlmor - - - - - p - p - fgkdlmor - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - g - g - pst - - - - - p - p - fgkdlmor - - - - - - - - - - - - - - - - - - - - - - - - - -
- -
-
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - cdn - - - - - - - - - - aq - - - - :,;/ - - - - - - - - - - acdeq - - - - - - constituent - - - - - - - <xsl:call-template name="chopPunctuation"> - <xsl:with-param name="chopString"> - <xsl:value-of select="."></xsl:value-of> - </xsl:with-param> - </xsl:call-template> - - - - - - - - - <xsl:call-template name="chopPunctuation"> - <xsl:with-param name="chopString"> - <xsl:value-of select="."></xsl:value-of> - </xsl:with-param> - </xsl:call-template> - - - - - - - - - - <xsl:call-template name="chopPunctuation"> - <xsl:with-param name="chopString"> - <xsl:value-of select="."></xsl:value-of> - </xsl:with-param> - </xsl:call-template> - - - - - - - - - - <xsl:call-template name="chopPunctuation"> - <xsl:with-param name="chopString"> - <xsl:value-of select="."></xsl:value-of> - </xsl:with-param> - </xsl:call-template> - - - - - - - - - - - - - - - code - marcgac - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - lcsh - lcshac - mesh - - nal - csh - rvm - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - aq - - - - - - - - - - - - - - - - - - - - - - - - - - - - - cdnp - - - - - - - - - - - - - - - abcdeqnp - - - - - - - - - - - - - - - - - - - <xsl:call-template name="chopPunctuation"> - <xsl:with-param name="chopString"> - <xsl:call-template name="subfieldSelect"> - <xsl:with-param name="codes">adfhklor</xsl:with-param> - </xsl:call-template> - </xsl:with-param> - </xsl:call-template> - <xsl:call-template name="part"></xsl:call-template> - - - - - - - - - - - - - abcd - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - bc - - - - - - - - - - - - - - - - - - - - - - - - - - - yes - - - - - - - - - - - - - - - - - - - - - - - - - - - Arabic - Latin - Chinese, Japanese, Korean - Cyrillic - Hebrew - Greek - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - summary or subtitle - sung or spoken text - libretto - table of contents - accompanying material - translation - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - summary or subtitle - sung or spoken text - libretto - table of contents - accompanying material - translation - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - .:,;/ - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
$$ WHERE name = 'mods32'; - --- Currently, the only difference from naco_normalize is that search_normalize --- turns apostrophes into spaces, while naco_normalize collapses them. -CREATE OR REPLACE FUNCTION public.search_normalize( TEXT, TEXT ) RETURNS TEXT AS $func$ - - use strict; - use Unicode::Normalize; - use Encode; - - my $str = decode_utf8(shift); - my $sf = shift; - - # Apply NACO normalization to input string; based on - # http://www.loc.gov/catdir/pcc/naco/SCA_PccNormalization_Final_revised.pdf - # - # Note that unlike a strict reading of the NACO normalization rules, - # output is returned as lowercase instead of uppercase for compatibility - # with previous versions of the Evergreen naco_normalize routine. - - # Convert to upper-case first; even though final output will be lowercase, doing this will - # ensure that the German eszett (ß) and certain ligatures (ff, fi, ffl, etc.) will be handled correctly. - # If there are any bugs in Perl's implementation of upcasing, they will be passed through here. - $str = uc $str; - - # remove non-filing strings - $str =~ s/\x{0098}.*?\x{009C}//g; - - $str = NFKD($str); - - # additional substitutions - 3.6. - $str =~ s/\x{00C6}/AE/g; - $str =~ s/\x{00DE}/TH/g; - $str =~ s/\x{0152}/OE/g; - $str =~ tr/\x{0110}\x{00D0}\x{00D8}\x{0141}\x{2113}\x{02BB}\x{02BC}][/DDOLl/d; - - # transformations based on Unicode category codes - $str =~ s/[\p{Cc}\p{Cf}\p{Co}\p{Cs}\p{Lm}\p{Mc}\p{Me}\p{Mn}]//g; - - if ($sf && $sf =~ /^a/o) { - my $commapos = index($str, ','); - if ($commapos > -1) { - if ($commapos != length($str) - 1) { - $str =~ s/,/\x07/; # preserve first comma - } - } - } - - # since we've stripped out the control characters, we can now - # use a few as placeholders temporarily - $str =~ tr/+&@\x{266D}\x{266F}#/\x01\x02\x03\x04\x05\x06/; - $str =~ s/[\p{Pc}\p{Pd}\p{Pe}\p{Pf}\p{Pi}\p{Po}\p{Ps}\p{Sk}\p{Sm}\p{So}\p{Zl}\p{Zp}\p{Zs}]/ /g; - $str =~ tr/\x01\x02\x03\x04\x05\x06\x07/+&@\x{266D}\x{266F}#,/; - - # decimal digits - $str =~ tr/\x{0660}-\x{0669}\x{06F0}-\x{06F9}\x{07C0}-\x{07C9}\x{0966}-\x{096F}\x{09E6}-\x{09EF}\x{0A66}-\x{0A6F}\x{0AE6}-\x{0AEF}\x{0B66}-\x{0B6F}\x{0BE6}-\x{0BEF}\x{0C66}-\x{0C6F}\x{0CE6}-\x{0CEF}\x{0D66}-\x{0D6F}\x{0E50}-\x{0E59}\x{0ED0}-\x{0ED9}\x{0F20}-\x{0F29}\x{1040}-\x{1049}\x{1090}-\x{1099}\x{17E0}-\x{17E9}\x{1810}-\x{1819}\x{1946}-\x{194F}\x{19D0}-\x{19D9}\x{1A80}-\x{1A89}\x{1A90}-\x{1A99}\x{1B50}-\x{1B59}\x{1BB0}-\x{1BB9}\x{1C40}-\x{1C49}\x{1C50}-\x{1C59}\x{A620}-\x{A629}\x{A8D0}-\x{A8D9}\x{A900}-\x{A909}\x{A9D0}-\x{A9D9}\x{AA50}-\x{AA59}\x{ABF0}-\x{ABF9}\x{FF10}-\x{FF19}/0-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-9/; - - # intentionally skipping step 8 of the NACO algorithm; if the string - # gets normalized away, that's fine. - - # leading and trailing spaces - $str =~ s/\s+/ /g; - $str =~ s/^\s+//; - $str =~ s/\s+$//g; - - return lc $str; -$func$ LANGUAGE 'plperlu' STRICT IMMUTABLE; - -CREATE OR REPLACE FUNCTION public.search_normalize_keep_comma( TEXT ) RETURNS TEXT AS $func$ - SELECT public.search_normalize($1,'a'); -$func$ LANGUAGE SQL STRICT IMMUTABLE; - -CREATE OR REPLACE FUNCTION public.search_normalize( TEXT ) RETURNS TEXT AS $func$ - SELECT public.search_normalize($1,''); -$func$ LANGUAGE 'sql' STRICT IMMUTABLE; - -INSERT INTO config.index_normalizer (name, description, func, param_count) VALUES ( - 'Search Normalize', - 'Apply search normalization rules to the extracted text. A less extreme version of NACO normalization.', - 'search_normalize', - 0 -); - -UPDATE config.metabib_field_index_norm_map - SET norm = ( - SELECT id FROM config.index_normalizer WHERE func = 'search_normalize' - ) - WHERE norm = ( - SELECT id FROM config.index_normalizer WHERE func = 'naco_normalize' - ) -; - - --- This could take a long time if you have a very non-English bib database --- Run it outside of a transaction to avoid lock escalation -SELECT metabib.reingest_metabib_field_entries(record) - FROM metabib.full_rec - WHERE tag = '245' - AND subfield = 'a' - AND value LIKE '%''%' -; - -COMMIT; - --- This is split out because it takes forever to run on large bib collections. -\qecho ************************************************************************ -\qecho The following transaction, wrapping upgrades 0679 and 0680, may take a -\qecho *really* long time, and you might be able to run it by itself in -\qecho parallel with other operations using a separate session. -\qecho ************************************************************************ - -BEGIN; -SELECT evergreen.upgrade_deps_block_check('0679', :eg_version); - --- Address typo in column name -ALTER TABLE config.metabib_class ADD COLUMN buoyant BOOL DEFAULT FALSE NOT NULL; -UPDATE config.metabib_class SET buoyant = bouyant; -ALTER TABLE config.metabib_class DROP COLUMN bouyant; - -CREATE OR REPLACE FUNCTION oils_tsearch2 () RETURNS TRIGGER AS $$ -DECLARE - normalizer RECORD; - value TEXT := ''; -BEGIN - - value := NEW.value; - - IF TG_TABLE_NAME::TEXT ~ 'field_entry$' THEN - FOR normalizer IN - SELECT n.func AS func, - n.param_count AS param_count, - m.params AS params - FROM config.index_normalizer n - JOIN config.metabib_field_index_norm_map m ON (m.norm = n.id) - WHERE field = NEW.field AND m.pos < 0 - ORDER BY m.pos LOOP - EXECUTE 'SELECT ' || normalizer.func || '(' || - quote_literal( value ) || - CASE - WHEN normalizer.param_count > 0 - THEN ',' || REPLACE(REPLACE(BTRIM(normalizer.params,'[]'),E'\'',E'\\\''),E'"',E'\'') - ELSE '' - END || - ')' INTO value; - - END LOOP; - - NEW.value := value; - END IF; - - IF NEW.index_vector = ''::tsvector THEN - RETURN NEW; - END IF; - - IF TG_TABLE_NAME::TEXT ~ 'field_entry$' THEN - FOR normalizer IN - SELECT n.func AS func, - n.param_count AS param_count, - m.params AS params - FROM config.index_normalizer n - JOIN config.metabib_field_index_norm_map m ON (m.norm = n.id) - WHERE field = NEW.field AND m.pos >= 0 - ORDER BY m.pos LOOP - EXECUTE 'SELECT ' || normalizer.func || '(' || - quote_literal( value ) || - CASE - WHEN normalizer.param_count > 0 - THEN ',' || REPLACE(REPLACE(BTRIM(normalizer.params,'[]'),E'\'',E'\\\''),E'"',E'\'') - ELSE '' - END || - ')' INTO value; - - END LOOP; - END IF; - - IF TG_TABLE_NAME::TEXT ~ 'browse_entry$' THEN - value := ARRAY_TO_STRING( - evergreen.regexp_split_to_array(value, E'\\W+'), ' ' - ); - value := public.search_normalize(value); - END IF; - - NEW.index_vector = to_tsvector((TG_ARGV[0])::regconfig, value); - - RETURN NEW; -END; -$$ LANGUAGE PLPGSQL; - --- Given a string such as a user might type into a search box, prepare --- two changed variants for TO_TSQUERY(). See --- http://www.postgresql.org/docs/9.0/static/textsearch-controls.html --- The first variant is normalized to match indexed documents regardless --- of diacritics. The second variant keeps its diacritics for proper --- highlighting via TS_HEADLINE(). -CREATE OR REPLACE - FUNCTION metabib.autosuggest_prepare_tsquery(orig TEXT) RETURNS TEXT[] AS -$$ -DECLARE - orig_ended_in_space BOOLEAN; - result RECORD; - plain TEXT; - normalized TEXT; -BEGIN - orig_ended_in_space := orig ~ E'\\s$'; - - orig := ARRAY_TO_STRING( - evergreen.regexp_split_to_array(orig, E'\\W+'), ' ' - ); - - normalized := public.search_normalize(orig); -- also trim()s - plain := trim(orig); - - IF NOT orig_ended_in_space THEN - plain := plain || ':*'; - normalized := normalized || ':*'; - END IF; - - plain := ARRAY_TO_STRING( - evergreen.regexp_split_to_array(plain, E'\\s+'), ' & ' - ); - normalized := ARRAY_TO_STRING( - evergreen.regexp_split_to_array(normalized, E'\\s+'), ' & ' - ); - - RETURN ARRAY[normalized, plain]; -END; -$$ LANGUAGE PLPGSQL; - - --- Definition of OUT parameters changes, so must drop first -DROP FUNCTION IF EXISTS metabib.suggest_browse_entries (TEXT, TEXT, TEXT, INTEGER, INTEGER, INTEGER); - -CREATE OR REPLACE - FUNCTION metabib.suggest_browse_entries( - raw_query_text TEXT, -- actually typed by humans at the UI level - search_class TEXT, -- 'alias' or 'class' or 'class|field..', etc - headline_opts TEXT, -- markup options for ts_headline() - visibility_org INTEGER,-- null if you don't want opac visibility test - query_limit INTEGER,-- use in LIMIT clause of interal query - normalization INTEGER -- argument to TS_RANK_CD() - ) RETURNS TABLE ( - value TEXT, -- plain - field INTEGER, - buoyant_and_class_match BOOL, - field_match BOOL, - field_weight INTEGER, - rank REAL, - buoyant BOOL, - match TEXT -- marked up - ) AS $func$ -DECLARE - prepared_query_texts TEXT[]; - query TSQUERY; - plain_query TSQUERY; - opac_visibility_join TEXT; - search_class_join TEXT; - r_fields RECORD; -BEGIN - prepared_query_texts := metabib.autosuggest_prepare_tsquery(raw_query_text); - - query := TO_TSQUERY('keyword', prepared_query_texts[1]); - plain_query := TO_TSQUERY('keyword', prepared_query_texts[2]); - - IF visibility_org IS NOT NULL THEN - opac_visibility_join := ' - JOIN asset.opac_visible_copies aovc ON ( - aovc.record = mbedm.source AND - aovc.circ_lib IN (SELECT id FROM actor.org_unit_descendants($4)) - )'; - ELSE - opac_visibility_join := ''; - END IF; - - -- The following determines whether we only provide suggestsons matching - -- the user's selected search_class, or whether we show other suggestions - -- too. The reason for MIN() is that for search_classes like - -- 'title|proper|uniform' you would otherwise get multiple rows. The - -- implication is that if title as a class doesn't have restrict, - -- nor does the proper field, but the uniform field does, you're going - -- to get 'false' for your overall evaluation of 'should we restrict?' - -- To invert that, change from MIN() to MAX(). - - SELECT - INTO r_fields - MIN(cmc.restrict::INT) AS restrict_class, - MIN(cmf.restrict::INT) AS restrict_field - FROM metabib.search_class_to_registered_components(search_class) - AS _registered (field_class TEXT, field INT) - JOIN - config.metabib_class cmc ON (cmc.name = _registered.field_class) - LEFT JOIN - config.metabib_field cmf ON (cmf.id = _registered.field); - - -- evaluate 'should we restrict?' - IF r_fields.restrict_field::BOOL OR r_fields.restrict_class::BOOL THEN - search_class_join := ' - JOIN - metabib.search_class_to_registered_components($2) - AS _registered (field_class TEXT, field INT) ON ( - (_registered.field IS NULL AND - _registered.field_class = cmf.field_class) OR - (_registered.field = cmf.id) - ) - '; - ELSE - search_class_join := ' - LEFT JOIN - metabib.search_class_to_registered_components($2) - AS _registered (field_class TEXT, field INT) ON ( - _registered.field_class = cmc.name - ) - '; - END IF; - - RETURN QUERY EXECUTE 'SELECT *, TS_HEADLINE(value, $7, $3) FROM (SELECT DISTINCT - mbe.value, - cmf.id, - cmc.buoyant AND _registered.field_class IS NOT NULL, - _registered.field = cmf.id, - cmf.weight, - TS_RANK_CD(mbe.index_vector, $1, $6), - cmc.buoyant - FROM metabib.browse_entry_def_map mbedm - JOIN metabib.browse_entry mbe ON (mbe.id = mbedm.entry) - JOIN config.metabib_field cmf ON (cmf.id = mbedm.def) - JOIN config.metabib_class cmc ON (cmf.field_class = cmc.name) - ' || search_class_join || opac_visibility_join || - ' WHERE $1 @@ mbe.index_vector - ORDER BY 3 DESC, 4 DESC NULLS LAST, 5 DESC, 6 DESC, 7 DESC, 1 ASC - LIMIT $5) x - ORDER BY 3 DESC, 4 DESC NULLS LAST, 5 DESC, 6 DESC, 7 DESC, 1 ASC - ' -- sic, repeat the order by clause in the outer select too - USING - query, search_class, headline_opts, - visibility_org, query_limit, normalization, plain_query - ; - - -- sort order: - -- buoyant AND chosen class = match class - -- chosen field = match field - -- field weight - -- rank - -- buoyancy - -- value itself - -END; -$func$ LANGUAGE PLPGSQL; - - -\qecho -\qecho The following takes about a minute per 100,000 rows in -\qecho metabib.browse_entry on my development system, which is only a VM with -\qecho 4 GB of memory and 2 cores. -\qecho -\qecho The following is a very loose estimate of how long the next UPDATE -\qecho statement would take to finish on MY machine, based on YOUR number -\qecho of rows in metabib.browse_entry: -\qecho - -SELECT (COUNT(id) / 100000.0) * INTERVAL '1 minute' - AS "approximate duration of following UPDATE statement" - FROM metabib.browse_entry; - -UPDATE metabib.browse_entry SET index_vector = TO_TSVECTOR( - 'keyword', - public.search_normalize( - ARRAY_TO_STRING( - evergreen.regexp_split_to_array(value, E'\\W+'), ' ' - ) - ) -); - - -SELECT evergreen.upgrade_deps_block_check('0680', :eg_version); - --- Not much use in having identifier-class fields be suggestions. Credit for the idea goes to Ben Shum. -UPDATE config.metabib_field SET browse_field = FALSE WHERE id < 100 AND field_class = 'identifier'; - - ---------------------------------------------------------------------------- --- The rest of this was tested on Evergreen Indiana's dev server, which has --- a large data set of 2.6M bibs, and was instrumental in sussing out the --- needed adjustments. Thanks, EG-IN! ---------------------------------------------------------------------------- - --- GIN indexes are /much/ better for prefix matching, which is important for browse and autosuggest ---Commented out the creation earlier, so we don't need to drop it here. ---DROP INDEX metabib.metabib_browse_entry_index_vector_idx; -CREATE INDEX metabib_browse_entry_index_vector_idx ON metabib.browse_entry USING GIN (index_vector); - - --- We need thes to make the autosuggest limiting joins fast -CREATE INDEX browse_entry_def_map_def_idx ON metabib.browse_entry_def_map (def); -CREATE INDEX browse_entry_def_map_entry_idx ON metabib.browse_entry_def_map (entry); -CREATE INDEX browse_entry_def_map_source_idx ON metabib.browse_entry_def_map (source); - --- In practice this will always be ~1 row, and the default of 1000 causes terrible plans -ALTER FUNCTION metabib.search_class_to_registered_components(text) ROWS 1; - --- Reworking of the generated query to act in a sane manner in the face of large datasets -CREATE OR REPLACE - FUNCTION metabib.suggest_browse_entries( - raw_query_text TEXT, -- actually typed by humans at the UI level - search_class TEXT, -- 'alias' or 'class' or 'class|field..', etc - headline_opts TEXT, -- markup options for ts_headline() - visibility_org INTEGER,-- null if you don't want opac visibility test - query_limit INTEGER,-- use in LIMIT clause of interal query - normalization INTEGER -- argument to TS_RANK_CD() - ) RETURNS TABLE ( - value TEXT, -- plain - field INTEGER, - buoyant_and_class_match BOOL, - field_match BOOL, - field_weight INTEGER, - rank REAL, - buoyant BOOL, - match TEXT -- marked up - ) AS $func$ -DECLARE - prepared_query_texts TEXT[]; - query TSQUERY; - plain_query TSQUERY; - opac_visibility_join TEXT; - search_class_join TEXT; - r_fields RECORD; -BEGIN - prepared_query_texts := metabib.autosuggest_prepare_tsquery(raw_query_text); - - query := TO_TSQUERY('keyword', prepared_query_texts[1]); - plain_query := TO_TSQUERY('keyword', prepared_query_texts[2]); - - IF visibility_org IS NOT NULL THEN - opac_visibility_join := ' - JOIN asset.opac_visible_copies aovc ON ( - aovc.record = x.source AND - aovc.circ_lib IN (SELECT id FROM actor.org_unit_descendants($4)) - )'; - ELSE - opac_visibility_join := ''; - END IF; - - -- The following determines whether we only provide suggestsons matching - -- the user's selected search_class, or whether we show other suggestions - -- too. The reason for MIN() is that for search_classes like - -- 'title|proper|uniform' you would otherwise get multiple rows. The - -- implication is that if title as a class doesn't have restrict, - -- nor does the proper field, but the uniform field does, you're going - -- to get 'false' for your overall evaluation of 'should we restrict?' - -- To invert that, change from MIN() to MAX(). - - SELECT - INTO r_fields - MIN(cmc.restrict::INT) AS restrict_class, - MIN(cmf.restrict::INT) AS restrict_field - FROM metabib.search_class_to_registered_components(search_class) - AS _registered (field_class TEXT, field INT) - JOIN - config.metabib_class cmc ON (cmc.name = _registered.field_class) - LEFT JOIN - config.metabib_field cmf ON (cmf.id = _registered.field); - - -- evaluate 'should we restrict?' - IF r_fields.restrict_field::BOOL OR r_fields.restrict_class::BOOL THEN - search_class_join := ' - JOIN - metabib.search_class_to_registered_components($2) - AS _registered (field_class TEXT, field INT) ON ( - (_registered.field IS NULL AND - _registered.field_class = cmf.field_class) OR - (_registered.field = cmf.id) - ) - '; - ELSE - search_class_join := ' - LEFT JOIN - metabib.search_class_to_registered_components($2) - AS _registered (field_class TEXT, field INT) ON ( - _registered.field_class = cmc.name - ) - '; - END IF; - - RETURN QUERY EXECUTE ' -SELECT DISTINCT - x.value, - x.id, - x.push, - x.restrict, - x.weight, - x.ts_rank_cd, - x.buoyant, - TS_HEADLINE(value, $7, $3) - FROM (SELECT DISTINCT - mbe.value, - cmf.id, - cmc.buoyant AND _registered.field_class IS NOT NULL AS push, - _registered.field = cmf.id AS restrict, - cmf.weight, - TS_RANK_CD(mbe.index_vector, $1, $6), - cmc.buoyant, - mbedm.source - FROM metabib.browse_entry_def_map mbedm - - -- Start with a pre-limited set of 10k possible suggestions. More than that is not going to be useful anyway - JOIN (SELECT * FROM metabib.browse_entry WHERE index_vector @@ $1 LIMIT 10000) mbe ON (mbe.id = mbedm.entry) - - JOIN config.metabib_field cmf ON (cmf.id = mbedm.def) - JOIN config.metabib_class cmc ON (cmf.field_class = cmc.name) - ' || search_class_join || ' - ORDER BY 3 DESC, 4 DESC NULLS LAST, 5 DESC, 6 DESC, 7 DESC, 1 ASC - LIMIT 1000) AS x -- This outer limit makes testing for opac visibility usably fast - ' || opac_visibility_join || ' - ORDER BY 3 DESC, 4 DESC NULLS LAST, 5 DESC, 6 DESC, 7 DESC, 1 ASC - LIMIT $5 -' -- sic, repeat the order by clause in the outer select too - USING - query, search_class, headline_opts, - visibility_org, query_limit, normalization, plain_query - ; - - -- sort order: - -- buoyant AND chosen class = match class - -- chosen field = match field - -- field weight - -- rank - -- buoyancy - -- value itself - -END; -$func$ LANGUAGE PLPGSQL; - -COMMIT; - -- This is split out because it was backported to 2.1, but may not exist before upgrades -- It can safely fail -- Also, lets say that. <_<