File "class-query-parser.php"
Full Path: /home/warrior1/public_html/languages/wp-content-20241001222009/plugins/jetpack/jetpack_vendor/automattic/jetpack-search/src/wpes/class-query-parser.php
File size: 18.49 KB
MIME-type: text/x-php
Charset: utf-8
<?php
/**
* Parse a pure text query into WordPress Elasticsearch query. This builds on
* the Query_Builder() to provide search query parsing.
*
* The key part of this parser is taking a user's query string typed into a box
* and converting it into an ES search query.
*
* This varies by application, but roughly it means extracting some parts of the query
* (authors, tags, and phrases) that are treated as a filter. Then taking the
* remaining words and building the correct query (possibly with prefix searching
* if we are doing search as you type)
*
* This class only supports ES 2.x+
*
* Disables comment checks.
* phpcs:disable Squiz.Commenting
*
* This parser builds queries of the form:
* bool:
* must:
* AND match of a single field (ideally an edgengram field)
* filter:
* filter clauses from context (eg @gibrown, #news, etc)
* should:
* boosting of results by various fields
*
* Features supported:
* - search as you type
* - phrases
* - supports querying across multiple languages at once
*
* @package automattic/jetpack-search
*/
namespace Automattic\Jetpack\Search\WPES;
/**
* Query parser class.
*/
class Query_Parser extends Query_Builder {
/**
 * The query exactly as it was handed to the constructor.
 *
 * @var string
 */
protected $orig_query = '';

/**
 * Working copy of the query. The filter methods of this class strip the
 * parts they recognise (prefixed fields, quoted phrases) out of this value.
 *
 * @var string
 */
protected $current_query = '';

/**
 * Language codes produced by norm_langs() from the constructor argument.
 *
 * @var array
 */
protected $langs;

/**
 * Language codes that norm_langs() accepts as-is; every other code is
 * mapped to 'default'.
 *
 * @var array
 */
protected $avail_langs = array( 'ar', 'bg', 'ca', 'cs', 'da', 'de', 'el', 'en', 'es', 'eu', 'fa', 'fi', 'fr', 'he', 'hi', 'hu', 'hy', 'id', 'it', 'ja', 'ko', 'nl', 'no', 'pt', 'ro', 'ru', 'sv', 'tr', 'zh' );

/**
 * Not read or written by any method of this class.
 *
 * @var array
 */
protected $extracted_phrases = array();

/**
 * Store the user's query and the normalized list of languages to search in.
 *
 * @param string $user_query The raw text typed by the user.
 * @param array  $langs      Language codes; passed through norm_langs().
 */
public function __construct( $user_query, $langs ) {
    $this->orig_query    = $user_query;
    $this->current_query = $user_query;
    $this->langs         = $this->norm_langs( $langs );
}
/**
 * Get the working query, i.e. whatever is left of the user's query after
 * the filter methods have removed the parts they handled.
 *
 * @return string The current value of the working query.
 */
public function get_current_query() {
return $this->current_query;
}
/**
 * Replace the working query. The original query kept in $orig_query is
 * not touched.
 *
 * @param string $q The new working query.
 */
public function set_current_query( $q ) {
$this->current_query = $q;
}
///////////////////////////////////////////////////////
// Methods for Building arrays of multilingual fields
/**
 * Normalize language codes.
 *
 * Each code is reduced to its first token when split on '-' or '_'
 * (so 'pt-br' and 'pt_BR' both become 'pt'). Tokens listed in
 * $avail_langs are kept; anything else is replaced by 'default'.
 * Duplicates are collapsed and first-seen order is preserved.
 *
 * @param array $langs Language codes to normalize.
 * @return array List of unique normalized codes.
 */
public function norm_langs( $langs ) {
    $seen = array();

    foreach ( $langs as $code ) {
        $base = strtok( $code, '-_' );
        $key  = in_array( $base, $this->avail_langs, true ) ? $base : 'default';

        // Using the code as the array key is what de-duplicates the list.
        $seen[ $key ] = true;
    }

    return array_keys( $seen );
}
/**
 * Get the language code to use as a field suffix.
 *
 * Only the first normalized language is used, even when several were given.
 *
 * @return string|null The first language code, or null when there is none.
 */
public function get_lang_field_suffix() {
    if ( is_array( $this->langs ) && ! empty( $this->langs ) ) {
        return $this->langs[0];
    }

    return null;
}
/**
 * Take a list of field prefixes and expand them for multi-lingual
 * search with the provided boostings.
 *
 * Every prefix is combined with every language in $this->langs to give
 * an entry of the form "<prefix>.<lang>^<boost>". The additional fields
 * are appended afterwards, unchanged.
 *
 * @param array $fields2boosts     Map of field prefix => boost.
 * @param array $additional_fields Field specs to append as-is.
 * @return array Flat list of field specs.
 */
public function merge_ml_fields( $fields2boosts, $additional_fields ) {
    $expanded = array();

    foreach ( $fields2boosts as $field => $boost ) {
        foreach ( $this->langs as $lang ) {
            $expanded[] = "{$field}.{$lang}^{$boost}";
        }
    }

    foreach ( $additional_fields as $extra ) {
        $expanded[] = $extra;
    }

    return $expanded;
}
////////////////////////////////////
// Extract Fields for Filtering on
/**
 * Extract any @mentions from the user query.
 *
 * A mention whose text exactly matches a user login becomes a filter on
 * that user's ID. A mention that matches no login is instead searched for
 * as text in must_query_fields. Every mention, matched or not, is also
 * searched for in boost_query_fields as a 'should' clause.
 *
 * All text matching any of the prefixes is removed from the working query.
 *
 * args:
 *    wpcom_id_field: wp.com id field
 *    must_query_fields: array of fields to search for matching results (optional)
 *    boost_query_fields: array of fields to search in for boosting results (optional)
 *    prefixes: array of prefixes that the user can use to indicate an author
 *
 * See also: https://github.com/twitter/twitter-text/blob/master/java/src/com/twitter/Regex.java
 *
 * @param array $args See the list above.
 * @return bool True when at least one mention was found, false otherwise.
 */
public function author_field_filter( $args ) {
    $defaults = array(
        'wpcom_id_field'     => 'author_id',
        'must_query_fields'  => null,
        'boost_query_fields' => null,
        'prefixes'           => array( '@' ),
    );
    $args     = wp_parse_args( $args, $defaults );

    $names = array();
    foreach ( $args['prefixes'] as $prefix ) {
        $found = $this->get_fields( $prefix );
        if ( $found ) {
            foreach ( $found as $name ) {
                $names[] = $name;
            }
        }
    }

    if ( empty( $names ) ) {
        return false;
    }

    foreach ( $args['prefixes'] as $prefix ) {
        $this->remove_fields( $prefix );
    }

    $user_ids = array();

    // Loop through the matches and separate them into filters and queries.
    foreach ( $names as $name ) {
        // Strip the quotes of a quoted mention BEFORE the login lookup;
        // otherwise @"Greg Brown" would be looked up with the quote
        // characters included and could never match a login.
        $is_phrase = false;
        if ( preg_match( '/"/', $name ) ) {
            $is_phrase = true;
            $name      = preg_replace( '/"/', '', $name );
        }

        // Check for an exact match on login.
        $userdata  = get_user_by( 'login', strtolower( $name ) );
        $filtering = false;
        if ( $userdata ) {
            $user_ids[ $userdata->ID ] = true;
            $filtering                 = true;
        }

        // Unknown logins fall back to a required text match.
        if ( ! empty( $args['must_query_fields'] ) && ! $filtering ) {
            $must = array(
                'fields' => $args['must_query_fields'],
                'query'  => $name,
            );
            if ( $is_phrase ) {
                $must['type'] = 'phrase';
            }
            $this->add_query( array( 'multi_match' => $must ) );
        }

        // Boosting applies whether or not the login was found.
        if ( ! empty( $args['boost_query_fields'] ) ) {
            $boost = array(
                'fields' => $args['boost_query_fields'],
                'query'  => $name,
            );
            if ( $is_phrase ) {
                $boost['type'] = 'phrase';
            }
            $this->add_query( array( 'multi_match' => $boost ), 'should' );
        }
    }

    if ( ! empty( $user_ids ) ) {
        $user_ids = array_keys( $user_ids );
        $this->add_filter( array( 'terms' => array( $args['wpcom_id_field'] => $user_ids ) ) );
    }

    return true;
}
/**
 * Extract any prefix followed by text and use it as a must clause,
 * and optionally as a boost in a should clause.
 * This can be used for hashtags, eg #News or #"current events",
 * but also works for any arbitrary field, eg from:Greg
 *
 * Quoted values are searched as phrases (with the quotes removed);
 * unquoted values use a plain multi_match. All text matching any of the
 * prefixes is removed from the working query.
 *
 * args:
 *    must_query_fields: array of fields that must match the tag (optional)
 *    boost_query_fields: array of fields to boost search on (optional)
 *    prefixes: array of prefixes that the user can use to indicate a tag
 *
 * @param array $args See the list above.
 * @return bool True when at least one prefixed term was found, false otherwise.
 */
public function text_field_filter( $args ) {
    $args = wp_parse_args(
        $args,
        array(
            'must_query_fields'  => array( 'tag.name' ),
            'boost_query_fields' => array( 'tag.name' ),
            'prefixes'           => array( '#' ),
        )
    );

    // Gather the terms for every prefix before anything is removed.
    $terms = array();
    foreach ( $args['prefixes'] as $prefix ) {
        $hits = $this->get_fields( $prefix );
        if ( ! $hits ) {
            continue;
        }
        foreach ( $hits as $hit ) {
            $terms[] = $hit;
        }
    }

    if ( 0 === count( $terms ) ) {
        return false;
    }

    foreach ( $args['prefixes'] as $prefix ) {
        $this->remove_fields( $prefix );
    }

    // Builds the multi_match clause; phrases only differ by the extra 'type'.
    $build_clause = static function ( $fields, $text, $as_phrase ) {
        $match = array(
            'fields' => $fields,
            'query'  => $text,
        );
        if ( $as_phrase ) {
            $match['type'] = 'phrase';
        }
        return array( 'multi_match' => $match );
    };

    $has_must  = ! empty( $args['must_query_fields'] );
    $has_boost = ! empty( $args['boost_query_fields'] );

    foreach ( $terms as $term ) {
        $quoted = false;
        if ( preg_match( '/"/', $term ) ) {
            $quoted = true;
            $term   = preg_replace( '/"/', '', $term );
        }

        if ( $has_must ) {
            $this->add_query( $build_clause( $args['must_query_fields'], $term, $quoted ) );
        }

        if ( $has_boost ) {
            $this->add_query( $build_clause( $args['boost_query_fields'], $term, $quoted ), 'should' );
        }
    }

    return true;
}
/**
 * Extract anything surrounded by quotes, plus a trailing phrase whose
 * opening quote was never closed, and add each one to the query as a
 * phrase query. Quotes can be either '' or "".
 *
 * Every extracted phrase is removed from the working query. Each phrase
 * is added as a phrase match on must_query_fields and, when
 * boost_query_fields is not empty, as an all-terms match on those fields
 * in a 'should' clause.
 *
 * args:
 *    must_query_fields: array of fields that must match the phrases
 *    boost_query_fields: array of fields to boost the phrases on (optional)
 *
 * @param array $args See the list above.
 * @return bool True when at least one phrase was found, false otherwise.
 */
public function phrase_filter( $args ) {
    $defaults = array(
        'must_query_fields'  => array( 'all_content' ),
        'boost_query_fields' => array( 'title' ),
    );
    $args     = wp_parse_args( $args, $defaults );

    $phrases = array();

    // Completed phrases: double-quoted first, then single-quoted.
    $complete_patterns = array( '/"([^"]+)"/', "/'([^']+)'/" );
    foreach ( $complete_patterns as $pattern ) {
        if ( preg_match_all( $pattern, $this->current_query, $matches ) ) {
            foreach ( $matches[1] as $match ) {
                $phrases[] = $match;
            }
            $this->current_query = preg_replace( $pattern, '', $this->current_query );
        }
    }

    // Look for a final, uncompleted phrase. Each hit is appended straight
    // to the list: a truthiness test would silently drop the phrase "0",
    // and a single shared variable would let the second pattern overwrite
    // text the first pattern has already removed from the query.
    $unfinished_patterns = array( '/"([^"]+)$/', "/(?:'\B|\B')([^']+)$/" );
    foreach ( $unfinished_patterns as $pattern ) {
        if ( preg_match_all( $pattern, $this->current_query, $matches ) ) {
            $phrases[]           = $matches[1][0];
            $this->current_query = preg_replace( $pattern, '', $this->current_query );
        }
    }

    if ( empty( $phrases ) ) {
        return false;
    }

    foreach ( $phrases as $phrase ) {
        $this->add_query(
            array(
                'multi_match' => array(
                    'fields' => $args['must_query_fields'],
                    'query'  => $phrase,
                    'type'   => 'phrase',
                ),
            )
        );

        if ( ! empty( $args['boost_query_fields'] ) ) {
            $this->add_query(
                array(
                    'multi_match' => array(
                        'fields'   => $args['boost_query_fields'],
                        'query'    => $phrase,
                        'operator' => 'and',
                    ),
                ),
                'should'
            );
        }
    }

    return true;
}
/**
 * Query fields based on the remaining parts of the query.
 * This could be the final AND part of the query terms to match, or it
 * could be boosting certain elements of the query.
 *
 * Nothing is added when the working query is empty or whitespace only.
 *
 * args:
 *    must_query_fields: array of fields that must match the remaining terms (optional)
 *    boost_query_fields: array of fields to boost the remaining terms on (optional)
 *    boost_operator: multi_match operator for the boost clause (default 'and')
 *    boost_query_type: multi_match type for the boost clause (default 'best_fields')
 *
 * @param array $args See the list above.
 */
public function remaining_query( $args ) {
    $defaults = array(
        'must_query_fields'  => null,
        'boost_query_fields' => null,
        'boost_operator'     => 'and',
        'boost_query_type'   => 'best_fields',
    );
    $args     = wp_parse_args( $args, $defaults );

    $remaining = $this->current_query;

    // empty() is true for the string "0", which is a legitimate search
    // term, so it is excluded from the "nothing left to search" test.
    if ( ( empty( $remaining ) && '0' !== $remaining ) || ctype_space( $remaining ) ) {
        return;
    }

    if ( ! empty( $args['must_query_fields'] ) ) {
        $this->add_query(
            array(
                'multi_match' => array(
                    'fields'   => $args['must_query_fields'],
                    'query'    => $remaining,
                    'operator' => 'and',
                ),
            )
        );
    }

    if ( ! empty( $args['boost_query_fields'] ) ) {
        $this->add_query(
            array(
                'multi_match' => array(
                    'fields'   => $args['boost_query_fields'],
                    'query'    => $remaining,
                    'operator' => $args['boost_operator'],
                    'type'     => $args['boost_query_type'],
                ),
            ),
            'should'
        );
    }
}
/**
 * Query fields using a prefix query (alphabetical expansions on the index).
 * This is not recommended. Slower performance and worse relevancy.
 *
 * (UNTESTED! Copied from old prefix expansion code)
 *
 * The last word of the working query is treated as a possibly unfinished
 * word (the "prefix word"); everything before it is the "prefix remainder".
 * When the query ends with a space there is no prefix word and plain
 * multi_match clauses are used instead.
 *
 * args:
 *    must_query_fields: array of fields that must match the remaining terms (optional)
 *    boost_query_fields: array of fields to boost the remaining terms on (optional)
 *    boost_operator: multi_match operator for the boost clause (default 'and')
 *    boost_query_type: multi_match type for the boost clause (default 'best_fields')
 *
 * @param array $args See the list above.
 */
public function remaining_prefix_query( $args ) {
    $defaults = array(
        'must_query_fields'  => array( 'all_content' ),
        'boost_query_fields' => array( 'title' ),
        'boost_operator'     => 'and',
        'boost_query_type'   => 'best_fields',
    );
    $args     = wp_parse_args( $args, $defaults );

    $full_text = $this->current_query;

    // empty() is true for the string "0", which is a legitimate search term.
    if ( ( empty( $full_text ) && '0' !== $full_text ) || ctype_space( $full_text ) ) {
        return;
    }

    // Example cases to think about:
    // "elasticse"
    // "elasticsearch"
    // "elasticsearch "
    // "elasticsearch lucen"
    // "elasticsearch lucene"
    // "the future" - note the stopword which will match nothing!
    // "F1" - an exact match that also has tons of expansions
    // "こんにちは" ja "hello"
    // "こんにちは友人" ja "hello friend" - we just rely on the prefix phrase and ES to split words
    // - this could still be better I bet. Maybe we need to analyze with ES first?

    // Extract the pieces of the query,
    // eg: "PREFIXREMAINDER PREFIXWORD"
    // "elasticsearch lucen"
    $prefix_word = false;
    if ( preg_match( '/([^ ]+)$/', $full_text, $word_match ) ) {
        $prefix_word = $word_match[1];
    }

    $prefix_remainder = preg_replace( '/([^ ]+)$/', '', $full_text );
    $has_remainder    = is_string( $prefix_remainder ) && '' !== $prefix_remainder && ! ctype_space( $prefix_remainder );

    // Compared against false (not truthiness) so that a last word of "0"
    // is still treated as a prefix word.
    if ( false === $prefix_word ) {
        // Space at the end of the query, so skip using a prefix query.
        if ( ! empty( $args['must_query_fields'] ) ) {
            $this->add_query(
                array(
                    'multi_match' => array(
                        'fields'   => $args['must_query_fields'],
                        'query'    => $full_text,
                        'operator' => 'and',
                    ),
                )
            );
        }

        if ( ! empty( $args['boost_query_fields'] ) ) {
            $this->add_query(
                array(
                    'multi_match' => array(
                        'fields'   => $args['boost_query_fields'],
                        'query'    => $full_text,
                        'operator' => $args['boost_operator'],
                        'type'     => $args['boost_query_type'],
                    ),
                ),
                'should'
            );
        }

        return;
    }

    // Must match the prefix word and the prefix remainder.
    if ( ! empty( $args['must_query_fields'] ) ) {
        // This class does not declare all_fields / phrase_fields. Use them
        // when something else (a parent or subclass) has populated them,
        // otherwise search the fields the caller asked for.
        $exact_fields  = ! empty( $this->all_fields ) ? $this->all_fields : $args['must_query_fields'];
        $phrase_fields = ! empty( $this->phrase_fields ) ? $this->phrase_fields : $args['must_query_fields'];

        // Need to do an OR across a few clauses to handle all cases.
        $alternatives = array();

        // Treat all words as an exact search (boosts complete words like
        // "news" over prefixes of "newspaper").
        $alternatives[] = array(
            'multi_match' => array(
                'fields'   => $exact_fields,
                'query'    => $full_text,
                'operator' => 'and',
                'type'     => 'cross_fields',
            ),
        );

        // Always optimistically try to match the full text as a phrase
        // prefix: "the futu" should try to match "the future", otherwise
        // the first stopword kinda breaks. This also works as the prefix
        // match for a single word such as "elasticsea".
        $alternatives[] = array(
            'multi_match' => array(
                'fields'         => $phrase_fields,
                'query'          => $full_text,
                'operator'       => 'and',
                'type'           => 'phrase_prefix',
                'max_expansions' => 100,
            ),
        );

        if ( $has_remainder ) {
            // Multiple words found, so treat each word on its own and not
            // just as part of a phrase:
            // "elasticsearch lucen" => "elasticsearch" exact AND "lucen" prefix
            $alternatives[] = array(
                'bool' => array(
                    'must' => array(
                        array(
                            'multi_match' => array(
                                'fields'         => $phrase_fields,
                                'query'          => $prefix_word,
                                'operator'       => 'and',
                                'type'           => 'phrase_prefix',
                                'max_expansions' => 100,
                            ),
                        ),
                        array(
                            'multi_match' => array(
                                'fields'   => $exact_fields,
                                'query'    => $prefix_remainder,
                                'operator' => 'and',
                                'type'     => 'cross_fields',
                            ),
                        ),
                    ),
                ),
            );
        }

        $this->add_query(
            array(
                'bool' => array(
                    'should'               => $alternatives,
                    'minimum_should_match' => 1,
                ),
            )
        );
    }

    // Now add any boosting of the query.
    if ( ! empty( $args['boost_query_fields'] ) ) {
        // The documented/defaulted key is 'boost_operator'. The misspelled
        // 'boost_query_operator' is still honoured when a caller supplies it.
        $boost_operator = isset( $args['boost_query_operator'] ) ? $args['boost_query_operator'] : $args['boost_operator'];

        // Treat all words as an exact search (boosts complete words like
        // "news" over prefixes of "newspaper").
        $this->add_query(
            array(
                'multi_match' => array(
                    'fields'   => $args['boost_query_fields'],
                    'query'    => $full_text,
                    'operator' => $boost_operator,
                    'type'     => $args['boost_query_type'],
                ),
            ),
            'should'
        );

        // Optimistically boost the full phrase prefix match. Added as a
        // 'should' clause like every other boost in this class, so that it
        // influences ranking without restricting the result set.
        $this->add_query(
            array(
                'multi_match' => array(
                    'fields'         => $args['boost_query_fields'],
                    'query'          => $full_text,
                    'operator'       => 'and',
                    'type'           => 'phrase_prefix',
                    'max_expansions' => 100,
                ),
            ),
            'should'
        );
    }
}
/**
 * Boost results based on the lang probability overlaps.
 *
 * One field_value_factor function is registered per entry. Only the
 * values of $langs2prob are used (as 'factor'); the keys are ignored.
 *
 * NOTE(review): no 'field' key is passed to add_function() here - confirm
 * against Query_Builder::add_function() that this is what it expects.
 *
 * args:
 *    langs2prob: list of languages to search in with associated boosts
 *
 * @param array $langs2prob Map of language => boost factor.
 */
public function boost_lang_probs( $langs2prob ) {
    foreach ( $langs2prob as $probability ) {
        $params = array(
            'modifier' => 'none',
            'factor'   => $probability,
            // 1% chance doc did not have right lang detected.
            'missing'  => 0.01,
        );

        $this->add_function( 'field_value_factor', $params );
    }
}
////////////////////////////////////
// Helper Methods
/**
 * Get the text after some prefix, eg @gibrown or @"Greg Brown".
 *
 * A value is either a double-quoted string (returned with its quotes) or
 * a run of characters up to the next Unicode separator.
 *
 * @param string $field_prefix Literal prefix that introduces a value.
 * @return array|false List of values found in the working query, or false when there are none.
 */
protected function get_fields( $field_prefix ) {
    // The prefix is literal text, so it is escaped before being embedded
    // in the pattern (a prefix such as '+' or '/' would otherwise change
    // or break the expression).
    $pattern = preg_quote( $field_prefix, '/' ) . '(("[^"]+")|([^\\p{Z}]+))';

    // The 'u' modifier makes \p{Z} test whole UTF-8 characters. Without it
    // the subject is scanned byte by byte and a 0xA0 byte inside a
    // multi-byte character is taken for a separator, cutting the value short.
    $count = preg_match_all( '/' . $pattern . '/u', $this->current_query, $match );

    if ( false === $count ) {
        // The query is not valid UTF-8; fall back to byte-wise matching
        // rather than losing the field altogether.
        $count = preg_match_all( '/' . $pattern . '/', $this->current_query, $match );
    }

    if ( $count ) {
        return $match[1];
    }

    return false;
}
/**
 * Remove the prefix and the text that follows it from the working query.
 *
 * The removed text is the prefix plus either a double-quoted string or a
 * run of characters up to the next Unicode separator.
 *
 * @param string $field_name Literal prefix that introduces a value.
 */
protected function remove_fields( $field_name ) {
    // The prefix is literal text, so it is escaped before being embedded
    // in the pattern.
    $pattern = preg_quote( $field_name, '/' ) . '(("[^"]+")|([^\\p{Z}]+))';

    // The 'u' modifier makes \p{Z} test whole UTF-8 characters instead of
    // single bytes, so multi-byte characters are never split.
    $stripped = preg_replace( '/' . $pattern . '/u', '', $this->current_query );

    if ( null === $stripped ) {
        // The query is not valid UTF-8; retry with byte-wise matching.
        $stripped = preg_replace( '/' . $pattern . '/', '', $this->current_query );
    }

    // preg_replace() returns null on failure; keep the query intact in
    // that case instead of wiping it.
    if ( null !== $stripped ) {
        $this->current_query = $stripped;
    }
}
/**
 * Best effort string truncation that splits on word breaks.
 *
 * Strings whose mb_strwidth() is within $limit are returned unchanged.
 * Otherwise the string is scanned backwards from offset $limit for the
 * break string and cut just before it; when no break is found the string
 * is chopped mid-word at $limit.
 *
 * @param string $string The text to shorten.
 * @param int    $limit  Maximum width of the result.
 * @param string $break  Break string to cut at (compared one position at a time).
 * @return string The possibly shortened text.
 */
protected function truncate_string( $string, $limit, $break = ' ' ) {
    if ( mb_strwidth( $string ) <= $limit ) {
        return $string;
    }

    // Walk backwards from $limit and cut at the first break encountered.
    for ( $position = $limit; $position > 0; $position-- ) {
        if ( mb_strimwidth( $string, $position, 1 ) === $break ) {
            return mb_strimwidth( $string, 0, $position );
        }
    }

    // No break was found, so the cut has to land mid-word.
    return mb_strimwidth( $string, 0, $limit );
}
}