Improvements to search tuning

author miker <miker@dcc99617-32d9-48b4-a31d-7c20da2025e4>

Mon, 18 Apr 2011 19:57:01 +0000 (19:57 +0000)

committer miker <miker@dcc99617-32d9-48b4-a31d-7c20da2025e4>

Mon, 18 Apr 2011 19:57:01 +0000 (19:57 +0000)
author miker <miker@dcc99617-32d9-48b4-a31d-7c20da2025e4>
Mon, 18 Apr 2011 19:57:01 +0000 (19:57 +0000)
committer miker <miker@dcc99617-32d9-48b4-a31d-7c20da2025e4>
Mon, 18 Apr 2011 19:57:01 +0000 (19:57 +0000)
diff --git a/Open-ILS/examples/opensrf.xml.example b/Open-ILS/examples/opensrf.xml.example

index b06817f..a5d5a79 100644 (file)
--- a/Open-ILS/examples/opensrf.xml.example
+++ b/Open-ILS/examples/opensrf.xml.example
@@ -476,6 +476,65 @@ vim:et:ts=4:sw=4:
                      -->
                      <estimation_strategy>inclusion</estimation_strategy>
  
+                    <!--
+                        Evergreen uses a cover density algorithm for calculating relative ranking of matches.  There
+                        are several tuning parameters and options available.  By default, no document length normalization
+                        is applied.  From the Postgres documentation on ts_rank_cd() (the function used by Evergreen):
+
+                            Since a longer document has a greater chance of containing a query term it is reasonable 
+                            to take into account document size, e.g., a hundred-word document with five instances of 
+                            a search word is probably more relevant than a thousand-word document with five instances. 
+                            Both ranking functions take an integer normalization option that specifies whether and how 
+                            a document's length should impact its rank. The integer option controls several behaviors, 
+                            so it is a bit mask: you can specify one or more behaviors using | (for example, 2|4).
+
+                                0 (the default) ignores the document length
+
+                                1 divides the rank by 1 + the logarithm of the document length
+
+                                2 divides the rank by the document length
+
+                                4 divides the rank by the mean harmonic distance between extents (this is implemented only by ts_rank_cd)
+
+                                8 divides the rank by the number of unique words in document
+
+                                16 divides the rank by 1 + the logarithm of the number of unique words in document
+
+                                32 divides the rank by itself + 1
+
+                            If more than one flag bit is specified, the transformations are applied in the order listed.
+
+                            It is important to note that the ranking functions do not use any global information, so it 
+                            is impossible to produce a fair normalization to 1% or 100% as sometimes desired. Normalization 
+                            option 32 (rank/(rank+1)) can be applied to scale all ranks into the range zero to one, but of 
+                            course this is just a cosmetic change; it will not affect the ordering of the search results.
+
+                        In Evergreen, these options are set via search modifiers.  The modifiers are mapped in the
+                        following way:
+
+                            * #CD_logDocumentLength  => 1  :: rank / (1 + LOG(total_word_count))   :: Longer documents slightly less relevant
+                            * #CD_documentLength     => 2  :: rank / total_word_count              :: Longer documents much less relevant
+                            * #CD_meanHarmonic       => 4  :: Word Proximity                       :: Greater matched-word distance is less relevant
+                            * #CD_uniqueWords        => 8  :: rank / unique_word_count             :: Documents with repeated words much less relevant
+                            * #CD_logUniqueWords     => 16 :: rank / (1 + LOG(unique_word_count))  :: Documents with repeated words slightly less relevant
+                            * #CD_selfPlusOne        => 32 :: rank / (1 + rank)                    :: Cosmetic normalization of rank value between 0 and 1
+
+                        Adding one or more of these to the default_CD_modifiers list will cause all searches that use QueryParser to apply them.
+                    -->
+                    <default_CD_modifiers>#CD_documentLength #CD_meanHarmonic #CD_uniqueWords</default_CD_modifiers>
+
+                    <!--
+                        default_preferred_language
+                            Set the global, default preferred language
+                    -->
+                    <default_preferred_language>eng</default_preferred_language>
+
+                    <!--
+                        default_preferred_language_weight
+                            Set the weight (higher is "better") for the preferred language. Comment out to remove all language weighting by default.
+                    -->
+                    <default_preferred_language_weight>5</default_preferred_language_weight>
+
                      <!-- Baseline number of records to check for hit estimation. -->
                      <superpage_size>1000</superpage_size>
  
@@ -798,12 +857,6 @@ vim:et:ts=4:sw=4:
                      <max_spare_children>5</max_spare_children>
                  </unix_config>
                  <app_settings>
-                    <!-- default_preferred_language: Set the global, default preferred languange -->
-                    <default_preferred_language>eng</default_preferred_language>
-
-                    <!-- default_preferred_language_weight: Set the weight (higher is "better") for the preferred language -->
-                    <default_preferred_language_weight>5</default_preferred_language_weight>
-
                      <script_path>LIBDIR/javascript/</script_path>
                      <script_path>LOCALSTATEDIR/catalog/</script_path>
                      <scripts>
diff --git a/Open-ILS/src/perlmods/lib/OpenILS/Application/Storage/Driver/Pg/QueryParser.pm b/Open-ILS/src/perlmods/lib/OpenILS/Application/Storage/Driver/Pg/QueryParser.pm

index f207eb0..cef9cca 100644 (file)
--- a/Open-ILS/src/perlmods/lib/OpenILS/Application/Storage/Driver/Pg/QueryParser.pm
+++ b/Open-ILS/src/perlmods/lib/OpenILS/Application/Storage/Driver/Pg/QueryParser.pm
@@ -894,7 +894,7 @@ sub rank {
      }
  
      return $self->{rank} if ($self->{rank});
-    return $self->{rank} = 'rank_cd(' . $self->table_alias . '.index_vector, ' . $self->table_alias . ".tsq, $cover_density)";
+    return $self->{rank} = 'ts_rank_cd(' . $self->table_alias . '.index_vector, ' . $self->table_alias . ".tsq, $cover_density)";
  }
  
  
diff --git a/Open-ILS/src/perlmods/lib/OpenILS/Application/Storage/Driver/Pg/fts.pm b/Open-ILS/src/perlmods/lib/OpenILS/Application/Storage/Driver/Pg/fts.pm

index 055266a..2857e2a 100644 (file)
--- a/Open-ILS/src/perlmods/lib/OpenILS/Application/Storage/Driver/Pg/fts.pm
+++ b/Open-ILS/src/perlmods/lib/OpenILS/Application/Storage/Driver/Pg/fts.pm
@@ -60,7 +60,7 @@
                 my @ranks;
                 for my $fts ( $self->fts_query ) {
                         push @output, join(' ', $self->fts_col, $self->{fts_op}, $fts);
-                       push @ranks, "rank($column, $fts)";
+                       push @ranks, "ts_rank($column, $fts)";
                 }
                 $self->{fts_rank} = \@ranks;
         
diff --git a/Open-ILS/src/perlmods/lib/OpenILS/Application/Storage/Publisher/metabib.pm b/Open-ILS/src/perlmods/lib/OpenILS/Application/Storage/Publisher/metabib.pm

index a2cc36b..0b146a0 100644 (file)
--- a/Open-ILS/src/perlmods/lib/OpenILS/Application/Storage/Publisher/metabib.pm
+++ b/Open-ILS/src/perlmods/lib/OpenILS/Application/Storage/Publisher/metabib.pm
@@ -2368,11 +2368,13 @@ sub staged_fts {
  
      }
  
+    my $config = OpenSRF::Utils::SettingsClient->new();
+
      if (!$default_preferred_language) {
  
-        $default_preferred_language = OpenSRF::Utils::SettingsClient
-            ->new
-            ->config_value(
+        $default_preferred_language = $config->config_value(
+                apps => 'open-ils.search' => app_settings => 'default_preferred_language'
+        ) || $config->config_value(
                  apps => 'open-ils.storage' => app_settings => 'default_preferred_language'
          );
  
@@ -2380,12 +2382,11 @@ sub staged_fts {
  
      if (!$default_preferred_language_weight) {
  
-        $default_preferred_language_weight = OpenSRF::Utils::SettingsClient
-            ->new
-            ->config_value(
+        $default_preferred_language_weight = $config->config_value(
+                apps => 'open-ils.storage' => app_settings => 'default_preferred_language_weight'
+        ) || $config->config_value(
                  apps => 'open-ils.storage' => app_settings => 'default_preferred_language_weight'
          );
-
      }
  
      # inclusion, exclusion, delete_adjusted_inclusion, delete_adjusted_exclusion
@@ -2842,6 +2843,11 @@ sub query_parser_fts {
                 die "No query was passed to ".$self->api_name;
         }
  
+    my $default_CD_modifiers = OpenSRF::Utils::SettingsClient->new->config_value(
+        apps => 'open-ils.search' => app_settings => 'default_CD_modifiers'
+    );
+    $args{query} = "$default_CD_modifiers $args{query}" if ($default_CD_modifiers);
+
  
      my $simple_plan = $args{_simple_plan};
      # remove bad chunks of the %args hash
@@ -2854,31 +2860,45 @@ sub query_parser_fts {
      # we expect, and make use of, query, superpage, superpage_size, debug and core_limit args
      my $query = $parser->new( %args )->parse;
  
+    my $config = OpenSRF::Utils::SettingsClient->new();
  
      # set the locale-based default prefered location
      if (!$query->parse_tree->find_filter('preferred_language')) {
          $parser->default_preferred_language( $args{preferred_language} );
+
          if (!$parser->default_preferred_language) {
                     my $ses_locale = $client->session ? $client->session->session_locale : '';
              $parser->default_preferred_language( $locale_map{ lc($ses_locale) } );
          }
-        $parser->default_preferred_language(
-            OpenSRF::Utils::SettingsClient->new->config_value(
+
+        if (!$parser->default_preferred_language) { # still nothing...
+            my $tmp_dpl = $config->config_value(
+                apps => 'open-ils.search' => app_settings => 'default_preferred_language'
+            ) || $config->config_value(
                  apps => 'open-ils.storage' => app_settings => 'default_preferred_language'
-            )
-        ) if (!$parser->default_preferred_language);
+            );
+
+            $parser->default_preferred_language( $tmp_dpl )
+        }
      }
  
  
      # set the global default language multiplier
      if (!$query->parse_tree->find_filter('preferred_language_weight') and !$query->parse_tree->find_filter('preferred_language_multiplier')) {
-        $parser->default_preferred_language_multiplier($args{preferred_language_weight});
-        $parser->default_preferred_language_multiplier($args{preferred_language_multiplier});
-        $parser->default_preferred_language_multiplier(
-            OpenSRF::Utils::SettingsClient->new->config_value(
+        my $tmp_dplw;
+
+        if ($tmp_dplw = $args{preferred_language_weight} || $args{preferred_language_multiplier} ) {
+            $parser->default_preferred_language_multiplier($tmp_dplw);
+
+        } else {
+            $tmp_dplw = $config->config_value(
+                apps => 'open-ils.search' => app_settings => 'default_preferred_language_weight'
+            ) || $config->config_value(
                  apps => 'open-ils.storage' => app_settings => 'default_preferred_language_weight'
-            )
-        ) if (!$parser->default_preferred_language_multiplier);
+            );
+
+            $parser->default_preferred_language_multiplier( $tmp_dplw );
+        }
      }
  
      # gather the site, if one is specified, defaulting to the in-query version
author	miker <miker@dcc99617-32d9-48b4-a31d-7c20da2025e4>
	Mon, 18 Apr 2011 19:57:01 +0000 (19:57 +0000)
committer	miker <miker@dcc99617-32d9-48b4-a31d-7c20da2025e4>
	Mon, 18 Apr 2011 19:57:01 +0000 (19:57 +0000)
Open-ILS/examples/opensrf.xml.example		patch \| blob \| history
Open-ILS/src/perlmods/lib/OpenILS/Application/Storage/Driver/Pg/QueryParser.pm		patch \| blob \| history
Open-ILS/src/perlmods/lib/OpenILS/Application/Storage/Driver/Pg/fts.pm		patch \| blob \| history
Open-ILS/src/perlmods/lib/OpenILS/Application/Storage/Publisher/metabib.pm		patch \| blob \| history