primogenial

File "class-query-parser.php"
Full Path: /home/warrior1/public_html/languages/wp-content-20241001222009/plugins/jetpack/jetpack_vendor/automattic/jetpack-search/src/wpes/class-query-parser.php
File size: 18.49 KB
MIME-type: text/x-php
Charset: utf-8
Open Edit Advanced Editor Back
<?php
/**
 * Parse a pure text query into WordPress Elasticsearch query. This builds on
 * the Query_Builder() to provide search query parsing.
 *
 * The key part of this parser is taking a user's query string typed into a box
 * and converting it into an ES search query.
 *
 * This varies by application, but roughly it means extracting some parts of the query
 * (authors, tags, and phrases) that are treated as a filter. Then taking the
 * remaining words and building the correct query (possibly with prefix searching
 * if we are doing search as you type)
 *
 * This class only supports ES 2.x+
 *
 * Disables comment chehcks.
 * phpcs:disable Squiz.Commenting
 *
 * This parser builds queries of the form:
 *   bool:
 *     must:
 *       AND match of a single field (ideally an edgengram field)
 *     filter:
 *       filter clauses from context (eg @gibrown, #news, etc)
 *     should:
 *       boosting of results by various fields
 *
 * Features supported:
 *  - search as you type
 *  - phrases
 *  - supports querying across multiple languages at once
 *
 * @package    automattic/jetpack-search
 */

namespace Automattic\Jetpack\Search\WPES;

/**
 * Query parser class.
 */
class Query_Parser extends Query_Builder {
	protected $orig_query    = '';
	protected $current_query = '';
	protected $langs;
	protected $avail_langs = array( 'ar', 'bg', 'ca', 'cs', 'da', 'de', 'el', 'en', 'es', 'eu', 'fa', 'fi', 'fr', 'he', 'hi', 'hu', 'hy', 'id', 'it', 'ja', 'ko', 'nl', 'no', 'pt', 'ro', 'ru', 'sv', 'tr', 'zh' );

	public function __construct( $user_query, $langs ) {
		$this->orig_query    = $user_query;
		$this->current_query = $this->orig_query;
		$this->langs         = $this->norm_langs( $langs );
	}

	protected $extracted_phrases = array();

	public function get_current_query() {
		return $this->current_query;
	}

	public function set_current_query( $q ) {
		$this->current_query = $q;
	}

	///////////////////////////////////////////////////////
	// Methods for Building arrays of multilingual fields

	/*
	 * Normalize language codes
	 */
	public function norm_langs( $langs ) {
		$lst = array();
		foreach ( $langs as $l ) {
			$l = strtok( $l, '-_' );
			if ( in_array( $l, $this->avail_langs, true ) ) {
				$lst[ $l ] = true;
			} else {
				$lst['default'] = true;
			}
		}
		return array_keys( $lst );
	}

	public function get_lang_field_suffix() {
		if ( ! is_array( $this->langs ) || empty( $this->langs ) ) {
			return;
		}

		// Returns the first language only
		return $this->langs[0];
	}

	/*
	 * Take a list of field prefixes and expand them for multi-lingual
	 * with the provided boostings.
	 */
	public function merge_ml_fields( $fields2boosts, $additional_fields ) {
		$flds = array();
		foreach ( $fields2boosts as $f => $b ) {
			foreach ( $this->langs as $l ) {
				$flds[] = $f . '.' . $l . '^' . $b;
			}
		}
		foreach ( $additional_fields as $f ) {
			$flds[] = $f;
		}
		return $flds;
	}

	////////////////////////////////////
	// Extract Fields for Filtering on

	/*
	 * Extract any @mentions from the user query
	 *  use them as a filter if we can find a wp.com id
	 *  otherwise use them as a
	 *
	 *  args:
	 *    wpcom_id_field: wp.com id field
	 *    must_query_fields: array of fields to search for matching results (optional)
	 *    boost_query_fields: array of fields to search in for boosting results (optional)
	 *    prefixes: array of prefixes that the user can use to indicate an author
	 *
	 *  returns true/false of whether any were found
	 *
	 * See also: https://github.com/twitter/twitter-text/blob/master/java/src/com/twitter/Regex.java
	 */
	public function author_field_filter( $args ) {
		$defaults = array(
			'wpcom_id_field'     => 'author_id',
			'must_query_fields'  => null,
			'boost_query_fields' => null,
			'prefixes'           => array( '@' ),
		);
		$args     = wp_parse_args( $args, $defaults );

		$names = array();
		foreach ( $args['prefixes'] as $p ) {
			$found = $this->get_fields( $p );
			if ( $found ) {
				foreach ( $found as $f ) {
					$names[] = $f;
				}
			}
		}

		if ( empty( $names ) ) {
			return false;
		}

		foreach ( $args['prefixes'] as $p ) {
			$this->remove_fields( $p );
		}

		$user_ids = array();

		//loop through the matches and separate into filters and queries
		foreach ( $names as $n ) {
			//check for exact match on login
			$userdata  = get_user_by( 'login', strtolower( $n ) );
			$filtering = false;
			if ( $userdata ) {
				$user_ids[ $userdata->ID ] = true;
				$filtering                 = true;
			}

			$is_phrase = false;
			if ( preg_match( '/"/', $n ) ) {
				$is_phrase = true;
				$n         = preg_replace( '/"/', '', $n );
			}

			if ( ! empty( $args['must_query_fields'] ) && ! $filtering ) {
				if ( $is_phrase ) {
					$this->add_query(
						array(
							'multi_match' => array(
								'fields' => $args['must_query_fields'],
								'query'  => $n,
								'type'   => 'phrase',
							),
						)
					);
				} else {
					$this->add_query(
						array(
							'multi_match' => array(
								'fields' => $args['must_query_fields'],
								'query'  => $n,
							),
						)
					);
				}
			}

			if ( ! empty( $args['boost_query_fields'] ) ) {
				if ( $is_phrase ) {
					$this->add_query(
						array(
							'multi_match' => array(
								'fields' => $args['boost_query_fields'],
								'query'  => $n,
								'type'   => 'phrase',
							),
						),
						'should'
					);
				} else {
					$this->add_query(
						array(
							'multi_match' => array(
								'fields' => $args['boost_query_fields'],
								'query'  => $n,
							),
						),
						'should'
					);
				}
			}
		}

		if ( ! empty( $user_ids ) ) {
			$user_ids = array_keys( $user_ids );
			$this->add_filter( array( 'terms' => array( $args['wpcom_id_field'] => $user_ids ) ) );
		}

		return true;
	}

	/*
	 * Extract any prefix followed by text use them as a must clause,
	 *   and optionally as a boost to the should query
	 *   This can be used for hashtags. eg #News, or #"current events",
	 *   but also works for any arbitrary field. eg from:Greg
	 *
	 *  args:
	 *    must_query_fields: array of fields that must match the tag (optional)
	 *    boost_query_fields: array of fields to boost search on (optional)
	 *    prefixes: array of prefixes that the user can use to indicate a tag
	 *
	 *  returns true/false of whether any were found
	 *
	 */
	public function text_field_filter( $args ) {
		$defaults = array(
			'must_query_fields'  => array( 'tag.name' ),
			'boost_query_fields' => array( 'tag.name' ),
			'prefixes'           => array( '#' ),
		);
		$args     = wp_parse_args( $args, $defaults );

		$tags = array();
		foreach ( $args['prefixes'] as $p ) {
			$found = $this->get_fields( $p );
			if ( $found ) {
				foreach ( $found as $f ) {
					$tags[] = $f;
				}
			}
		}

		if ( empty( $tags ) ) {
			return false;
		}

		foreach ( $args['prefixes'] as $p ) {
			$this->remove_fields( $p );
		}

		foreach ( $tags as $t ) {
			$is_phrase = false;
			if ( preg_match( '/"/', $t ) ) {
				$is_phrase = true;
				$t         = preg_replace( '/"/', '', $t );
			}

			if ( ! empty( $args['must_query_fields'] ) ) {
				if ( $is_phrase ) {
					$this->add_query(
						array(
							'multi_match' => array(
								'fields' => $args['must_query_fields'],
								'query'  => $t,
								'type'   => 'phrase',
							),
						)
					);
				} else {
					$this->add_query(
						array(
							'multi_match' => array(
								'fields' => $args['must_query_fields'],
								'query'  => $t,
							),
						)
					);
				}
			}

			if ( ! empty( $args['boost_query_fields'] ) ) {
				if ( $is_phrase ) {
					$this->add_query(
						array(
							'multi_match' => array(
								'fields' => $args['boost_query_fields'],
								'query'  => $t,
								'type'   => 'phrase',
							),
						),
						'should'
					);
				} else {
					$this->add_query(
						array(
							'multi_match' => array(
								'fields' => $args['boost_query_fields'],
								'query'  => $t,
							),
						),
						'should'
					);
				}
			}
		}

		return true;
	}

	/*
	 * Extract anything surrounded by quotes or if there is an opening quote
	 *   that is not complete, and add them to the query as a phrase query.
	 *   Quotes can be either '' or ""
	 *
	 *  args:
	 *    must_query_fields: array of fields that must match the phrases
	 *    boost_query_fields: array of fields to boost the phrases on (optional)
	 *
	 *  returns true/false of whether any were found
	 *
	 */
	public function phrase_filter( $args ) {
		$defaults = array(
			'must_query_fields'  => array( 'all_content' ),
			'boost_query_fields' => array( 'title' ),
		);
		$args     = wp_parse_args( $args, $defaults );

		$phrases = array();
		if ( preg_match_all( '/"([^"]+)"/', $this->current_query, $matches ) ) {
			foreach ( $matches[1] as $match ) {
				$phrases[] = $match;
			}
			$this->current_query = preg_replace( '/"([^"]+)"/', '', $this->current_query );
		}

		if ( preg_match_all( "/'([^']+)'/", $this->current_query, $matches ) ) {
			foreach ( $matches[1] as $match ) {
				$phrases[] = $match;
			}
			$this->current_query = preg_replace( "/'([^']+)'/", '', $this->current_query );
		}

		//look for a final, uncompleted phrase
		$phrase_prefix = false;
		if ( preg_match_all( '/"([^"]+)$/', $this->current_query, $matches ) ) {
			$phrase_prefix       = $matches[1][0];
			$this->current_query = preg_replace( '/"([^"]+)$/', '', $this->current_query );
		}
		if ( preg_match_all( "/(?:'\B|\B')([^']+)$/", $this->current_query, $matches ) ) {
			$phrase_prefix       = $matches[1][0];
			$this->current_query = preg_replace( "/(?:'\B|\B')([^']+)$/", '', $this->current_query );
		}

		if ( $phrase_prefix ) {
			$phrases[] = $phrase_prefix;
		}
		if ( empty( $phrases ) ) {
			return false;
		}

		foreach ( $phrases as $p ) {
			$this->add_query(
				array(
					'multi_match' => array(
						'fields' => $args['must_query_fields'],
						'query'  => $p,
						'type'   => 'phrase',
					),
				)
			);

			if ( ! empty( $args['boost_query_fields'] ) ) {
				$this->add_query(
					array(
						'multi_match' => array(
							'fields'   => $args['boost_query_fields'],
							'query'    => $p,
							'operator' => 'and',
						),
					),
					'should'
				);
			}
		}

		return true;
	}

	/*
	 * Query fields based on the remaining parts of the query
	 *   This could be the final AND part of the query terms to match, or it
	 *   could be boosting certain elements of the query
	 *
	 *  args:
	 *    must_query_fields: array of fields that must match the remaining terms (optional)
	 *    boost_query_fields: array of fields to boost the remaining terms on (optional)
	 *
	 */
	public function remaining_query( $args ) {
		$defaults = array(
			'must_query_fields'  => null,
			'boost_query_fields' => null,
			'boost_operator'     => 'and',
			'boost_query_type'   => 'best_fields',
		);
		$args     = wp_parse_args( $args, $defaults );

		if ( empty( $this->current_query ) || ctype_space( $this->current_query ) ) {
			return;
		}

		if ( ! empty( $args['must_query_fields'] ) ) {
			$this->add_query(
				array(
					'multi_match' => array(
						'fields'   => $args['must_query_fields'],
						'query'    => $this->current_query,
						'operator' => 'and',
					),
				)
			);
		}

		if ( ! empty( $args['boost_query_fields'] ) ) {
			$this->add_query(
				array(
					'multi_match' => array(
						'fields'   => $args['boost_query_fields'],
						'query'    => $this->current_query,
						'operator' => $args['boost_operator'],
						'type'     => $args['boost_query_type'],
					),
				),
				'should'
			);
		}

	}

	/*
	 * Query fields using a prefix query (alphabetical expansions on the index).
	 *   This is not recommended. Slower performance and worse relevancy.
	 *
	 *  (UNTESTED! Copied from old prefix expansion code)
	 *
	 *  args:
	 *    must_query_fields: array of fields that must match the remaining terms (optional)
	 *    boost_query_fields: array of fields to boost the remaining terms on (optional)
	 *
	 */
	public function remaining_prefix_query( $args ) {
		$defaults = array(
			'must_query_fields'  => array( 'all_content' ),
			'boost_query_fields' => array( 'title' ),
			'boost_operator'     => 'and',
			'boost_query_type'   => 'best_fields',
		);
		$args     = wp_parse_args( $args, $defaults );

		if ( empty( $this->current_query ) || ctype_space( $this->current_query ) ) {
			return;
		}

		//////////////////////////////////
		// Example cases to think about:
		// "elasticse"
		// "elasticsearch"
		// "elasticsearch "
		// "elasticsearch lucen"
		// "elasticsearch lucene"
		// "the future"  - note the stopword which will match nothing!
		// "F1" - an exact match that also has tons of expansions
		// "こんにちは" ja "hello"
		// "こんにちは友人" ja "hello friend" - we just rely on the prefix phrase and ES to split words
		//   - this could still be better I bet. Maybe we need to analyze with ES first?
		//

		/////////////////////////////
		//extract pieces of query
		// eg: "PREFIXREMAINDER PREFIXWORD"
		//     "elasticsearch lucen"

		$prefix_word      = false;
		$prefix_remainder = false;
		if ( preg_match_all( '/([^ ]+)$/', $this->current_query, $matches ) ) {
			$prefix_word = $matches[1][0];
		}

		$prefix_remainder = preg_replace( '/([^ ]+)$/', '', $this->current_query );
		if ( ctype_space( $prefix_remainder ) ) {
			$prefix_remainder = false;
		}

		if ( ! $prefix_word ) {
			//Space at the end of the query, so skip using a prefix query
			if ( ! empty( $args['must_query_fields'] ) ) {
				$this->add_query(
					array(
						'multi_match' => array(
							'fields'   => $args['must_query_fields'],
							'query'    => $this->current_query,
							'operator' => 'and',
						),
					)
				);
			}

			if ( ! empty( $args['boost_query_fields'] ) ) {
				$this->add_query(
					array(
						'multi_match' => array(
							'fields'   => $args['boost_query_fields'],
							'query'    => $this->current_query,
							'operator' => $args['boost_operator'],
							'type'     => $args['boost_query_type'],
						),
					),
					'should'
				);
			}
		} else {

			//must match the prefix word and the prefix remainder
			if ( ! empty( $args['must_query_fields'] ) ) {
				//need to do an OR across a few fields to handle all cases
				$must_q = array(
					'bool' => array(
						'should'               => array(),
						'minimum_should_match' => 1,
					),
				);

				//treat all words as an exact search (boosts complete word like "news"
				//from prefixes of "newspaper")
				$must_q['bool']['should'][] = array(
					'multi_match' => array(
						'fields'   => $this->all_fields,
						// NOTE: This line has been disabled since $full_text is not available.
						// 'query'    => $full_text,
						'operator' => 'and',
						'type'     => 'cross_fields',
					),
				);

				//always optimistically try and match the full text as a phrase
				//prefix "the futu" should try to match "the future"
				//otherwise the first stopword kinda breaks
				//This also works as the prefix match for a single word "elasticsea"
				$must_q['bool']['should'][] = array(
					'multi_match' => array(
						'fields'         => $this->phrase_fields,
						// NOTE: This line has been disabled since $full_text is not available.
						// 'query'          => $full_text,
						'operator'       => 'and',
						'type'           => 'phrase_prefix',
						'max_expansions' => 100,
					),
				);

				if ( $prefix_remainder ) {
					//Multiple words found, so treat each word on its own and not just as
					//a part of a phrase
					//"elasticsearch lucen" => "elasticsearch" exact AND "lucen" prefix
					$must_q['bool']['should'][] = array(
						'bool' => array(
							'must' => array(
								array(
									'multi_match' => array(
										'fields'         => $this->phrase_fields,
										'query'          => $prefix_word,
										'operator'       => 'and',
										'type'           => 'phrase_prefix',
										'max_expansions' => 100,
									),
								),
								array(
									'multi_match' => array(
										'fields'   => $this->all_fields,
										'query'    => $prefix_remainder,
										'operator' => 'and',
										'type'     => 'cross_fields',
									),
								),
							),
						),
					);
				}

				$this->add_query( $must_q );
			}

			//Now add any boosting of the query
			if ( ! empty( $args['boost_query_fields'] ) ) {
				//treat all words as an exact search (boosts complete word like "news"
				//from prefixes of "newspaper")
				$this->add_query(
					array(
						'multi_match' => array(
							'fields'   => $args['boost_query_fields'],
							'query'    => $this->current_query,
							'operator' => $args['boost_query_operator'],
							'type'     => $args['boost_query_type'],
						),
					),
					'should'
				);

				//optimistically boost the full phrase prefix match
				$this->add_query(
					array(
						'multi_match' => array(
							'fields'         => $args['boost_query_fields'],
							'query'          => $this->current_query,
							'operator'       => 'and',
							'type'           => 'phrase_prefix',
							'max_expansions' => 100,
						),
					)
				);
			}
		}
	}

	/*
	 * Boost results based on the lang probability overlaps
	 *
	 *  args:
	 *    langs2prob: list of languages to search in with associated boosts
	 */
	public function boost_lang_probs( $langs2prob ) {
		foreach ( $langs2prob as $p ) {
			$this->add_function(
				'field_value_factor',
				array(
					'modifier' => 'none',
					'factor'   => $p,
					'missing'  => 0.01, //1% chance doc did not have right lang detected
				)
			);
		}
	}

	////////////////////////////////////
	// Helper Methods

	//Get the text after some prefix. eg @gibrown, or @"Greg Brown"
	protected function get_fields( $field_prefix ) {
		$regex = '/' . $field_prefix . '(("[^"]+")|([^\\p{Z}]+))/';
		if ( preg_match_all( $regex, $this->current_query, $match ) ) {
			return $match[1];
		}
		return false;
	}

	//Remove the prefix and text from the query
	protected function remove_fields( $field_name ) {
		$regex               = '/' . $field_name . '(("[^"]+")|([^\\p{Z}]+))/';
		$this->current_query = preg_replace( $regex, '', $this->current_query );
	}

	//Best effort string truncation that splits on word breaks
	protected function truncate_string( $string, $limit, $break = ' ' ) {
		if ( mb_strwidth( $string ) <= $limit ) {
			return $string;
		}

		// walk backwards from $limit to find first break
		$breakpoint = $limit;
		$broken     = false;
		while ( $breakpoint > 0 ) {
			if ( mb_strimwidth( $string, $breakpoint, 1 ) === $break ) {
				$string = mb_strimwidth( $string, 0, $breakpoint );
				$broken = true;
				break;
			}
			$breakpoint--;
		}
		// if we weren't able to find a break, need to chop mid-word
		if ( ! $broken ) {
			$string = mb_strimwidth( $string, 0, $limit );
		}
		return $string;
	}

}