diff --git a/docs/reference/es_compatible_api.md b/docs/reference/es_compatible_api.md index 32cbdafd761..28ba1aa7eb2 100644 --- a/docs/reference/es_compatible_api.md +++ b/docs/reference/es_compatible_api.md @@ -365,6 +365,79 @@ Example response: [HTTP accept header]: https://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html + +### `_field_caps`   Field capabilities API + +``` +GET api/v1/_elastic/<index>/_field_caps +``` +``` +POST api/v1/_elastic/<index>/_field_caps +``` +``` +GET api/v1/_elastic/_field_caps +``` +``` +POST api/v1/_elastic/_field_caps +``` + +The [field capabilities API](https://www.elastic.co/guide/en/elasticsearch/reference/current/search-field-caps.html) returns information about the capabilities of fields among multiple indices. + +#### Supported Query string parameters + +| Variable | Type | Description | Default value | +| --------------------- | ---------- | ------------------------------------------------------------------------------ | ------------- | +| `fields` | `String` | Comma-separated list of fields to retrieve capabilities for. Supports wildcards (`*`). | (Optional) | +| `allow_no_indices` | `Boolean` | If `true`, missing or closed indices are not an error. | (Optional) | +| `expand_wildcards` | `String` | Controls which kinds of indices wildcard patterns can match. | (Optional) | +| `ignore_unavailable` | `Boolean` | If `true`, unavailable indices are ignored. | (Optional) | +| `start_timestamp` | `Integer` | *(Quickwit-specific)* If set, restricts splits to documents with a timestamp range start >= `start_timestamp` (seconds since epoch). | (Optional) | +| `end_timestamp` | `Integer` | *(Quickwit-specific)* If set, restricts splits to documents with a timestamp range end < `end_timestamp` (seconds since epoch). 
| (Optional) | + +#### Supported Request Body parameters + +| Variable | Type | Description | Default value | +| ------------------ | ------------- | --------------------------------------------------------------------------- | ------------- | +| `index_filter` | `Json object` | A query to filter indices. If provided, only fields from indices that can potentially match the filter are returned. See [index_filter](#index_filter). | (Optional) | +| `runtime_mappings` | `Json object` | Accepted for compatibility but ignored. | (Optional) | + +#### `index_filter` + +The `index_filter` parameter allows you to filter which indices contribute to the field capabilities response. When provided, Quickwit uses the filter query to prune indices (splits) that cannot match the filter, and only returns field capabilities for the remaining ones. + +Like Elasticsearch, this is a **best-effort** approach: Quickwit may return field capabilities from indices that do not actually contain any matching documents. In Quickwit, filtering is limited to the existing metadata-based split pruning: + +- **Time pruning**: Range queries on the timestamp field can eliminate splits whose time range does not overlap with the filter. When several indices are targeted, the timestamp field of the first index that defines one is used. +- **Tag pruning**: Term queries on [tag fields](../configuration/index-config.md#tag-fields) can eliminate splits that do not contain the requested tag value. + +Other filter types (e.g. full-text queries or term queries on non-tag fields) are accepted but will not prune any splits — field capabilities are returned for all splits, as if no filter had been specified. In particular, Quickwit does not check whether terms are present in the term dictionary. 
+ +#### Request Body example + +```json +{ + "index_filter": { + "range": { + "timestamp": { + "gte": "2024-01-01T00:00:00Z", + "lt": "2024-02-01T00:00:00Z" + } + } + } +} +``` + +```json +{ + "index_filter": { + "term": { + "status": "active" + } + } +} +``` + + ## Query DSL [Elasticsearch Query DSL reference](https://www.elastic.co/guide/en/elasticsearch/reference/8.8/query-dsl.html). diff --git a/quickwit/quickwit-proto/protos/quickwit/search.proto b/quickwit/quickwit-proto/protos/quickwit/search.proto index ae3442fe1aa..a49b951b3ca 100644 --- a/quickwit/quickwit-proto/protos/quickwit/search.proto +++ b/quickwit/quickwit-proto/protos/quickwit/search.proto @@ -125,6 +125,10 @@ message ListFieldsRequest { optional int64 start_timestamp = 3; optional int64 end_timestamp = 4; + // JSON-serialized QueryAst built from the Elasticsearch `index_filter`. + // When provided, it is used only to prune splits (time range and tags); it does not filter individual documents. + optional string query_ast = 5; + // Control if the request will fail if split_ids contains a split that does not exist. // optional bool fail_on_missing_index = 6; } @@ -141,7 +145,6 @@ message LeafListFieldsRequest { // Optional limit query to a list of fields // Wildcard expressions are supported. repeated string fields = 4; - } message ListFieldsResponse { diff --git a/quickwit/quickwit-proto/src/codegen/quickwit/quickwit.search.rs b/quickwit/quickwit-proto/src/codegen/quickwit/quickwit.search.rs index 1e933055cd3..16c11358ab8 100644 --- a/quickwit/quickwit-proto/src/codegen/quickwit/quickwit.search.rs +++ b/quickwit/quickwit-proto/src/codegen/quickwit/quickwit.search.rs @@ -70,6 +70,10 @@ pub struct ListFieldsRequest { pub start_timestamp: ::core::option::Option<i64>, #[prost(int64, optional, tag = "4")] pub end_timestamp: ::core::option::Option<i64>, + /// JSON-serialized QueryAst built from the Elasticsearch `index_filter`. + /// When provided, it is used only to prune splits (time range and tags); it does not filter individual documents. 
+ #[prost(string, optional, tag = "5")] + pub query_ast: ::core::option::Option<::prost::alloc::string::String>, } #[derive(serde::Serialize, serde::Deserialize, utoipa::ToSchema)] #[derive(Clone, PartialEq, ::prost::Message)] diff --git a/quickwit/quickwit-search/src/list_fields.rs b/quickwit/quickwit-search/src/list_fields.rs index f4cf173fe08..b5974867cfd 100644 --- a/quickwit/quickwit-search/src/list_fields.rs +++ b/quickwit/quickwit-search/src/list_fields.rs @@ -24,6 +24,8 @@ use itertools::Itertools; use quickwit_common::rate_limited_warn; use quickwit_common::shared_consts::{FIELD_PRESENCE_FIELD_NAME, SPLIT_FIELDS_FILE_NAME}; use quickwit_common::uri::Uri; +use quickwit_config::build_doc_mapper; +use quickwit_doc_mapper::tag_pruning::extract_tags_from_query; use quickwit_metastore::SplitMetadata; use quickwit_proto::metastore::MetastoreServiceClient; use quickwit_proto::search::{ @@ -31,6 +33,7 @@ use quickwit_proto::search::{ ListFieldsResponse, SplitIdAndFooterOffsets, deserialize_split_fields, }; use quickwit_proto::types::{IndexId, IndexUid}; +use quickwit_query::query_ast::QueryAst; use quickwit_storage::Storage; use crate::leaf::open_split_bundle; @@ -310,6 +313,8 @@ impl FieldPattern { } /// `leaf` step of list fields. +/// +/// Returns field metadata from the assigned splits. pub async fn leaf_list_fields( index_id: IndexId, index_storage: Arc, @@ -322,6 +327,12 @@ pub async fn leaf_list_fields( .map(|pattern_str| FieldPattern::from_str(pattern_str)) .collect::>()?; + // If no splits, return empty response + if split_ids.is_empty() { + return Ok(ListFieldsResponse { fields: Vec::new() }); + } + + // Get fields from all splits let single_split_list_fields_futures: Vec<_> = split_ids .iter() .map(|split_id| { @@ -375,7 +386,7 @@ pub async fn leaf_list_fields( Ok(ListFieldsResponse { fields }) } -/// Index metas needed for executing a leaf search request. +/// Index metas needed for executing a leaf list fields request. 
#[derive(Clone, Debug)] pub struct IndexMetasForLeafSearch { /// Index id. @@ -399,29 +410,63 @@ pub async fn root_list_fields( if indexes_metadata.is_empty() { return Ok(ListFieldsResponse { fields: Vec::new() }); } - let index_uid_to_index_meta: HashMap = indexes_metadata - .iter() - .map(|index_metadata| { - let index_metadata_for_leaf_search = IndexMetasForLeafSearch { - index_uri: index_metadata.index_uri().clone(), - index_id: index_metadata.index_config.index_id.to_string(), - }; - - ( - index_metadata.index_uid.clone(), - index_metadata_for_leaf_search, + + // Build index metadata map and extract timestamp field for time range refinement + let mut index_uid_to_index_meta: HashMap = HashMap::new(); + let mut index_uids: Vec = Vec::new(); + let mut timestamp_field_opt: Option = None; + + for index_metadata in indexes_metadata { + // Extract timestamp field for time range refinement (use first index's field) + if timestamp_field_opt.is_none() + && list_fields_req.query_ast.is_some() + && let Ok(doc_mapper) = build_doc_mapper( + &index_metadata.index_config.doc_mapping, + &index_metadata.index_config.search_settings, ) - }) - .collect(); - let index_uids: Vec = indexes_metadata - .into_iter() - .map(|index_metadata| index_metadata.index_uid) - .collect(); + { + timestamp_field_opt = doc_mapper.timestamp_field_name().map(|s| s.to_string()); + } + + let index_metadata_for_leaf_search = IndexMetasForLeafSearch { + index_uri: index_metadata.index_uri().clone(), + index_id: index_metadata.index_config.index_id.to_string(), + }; + + index_uids.push(index_metadata.index_uid.clone()); + index_uid_to_index_meta.insert( + index_metadata.index_uid.clone(), + index_metadata_for_leaf_search, + ); + } + + // Extract tags and refine time range from query_ast for split pruning + let mut start_timestamp = list_fields_req.start_timestamp; + let mut end_timestamp = list_fields_req.end_timestamp; + let tags_filter_opt = if let Some(ref query_ast_json) = list_fields_req.query_ast { 
+ let query_ast: QueryAst = serde_json::from_str(query_ast_json) + .map_err(|err| SearchError::InvalidQuery(err.to_string()))?; + + // Refine time range from query AST if timestamp field is available + if let Some(ref timestamp_field) = timestamp_field_opt { + crate::root::refine_start_end_timestamp_from_ast( + &query_ast, + timestamp_field, + &mut start_timestamp, + &mut end_timestamp, + ); + } + + extract_tags_from_query(query_ast) + } else { + None + }; + let split_metadatas: Vec = list_relevant_splits( index_uids, - list_fields_req.start_timestamp, - list_fields_req.end_timestamp, - None, + start_timestamp, + end_timestamp, + tags_filter_opt, &mut metastore, ) .await?; diff --git a/quickwit/quickwit-serve/src/elasticsearch_api/model/field_capability.rs b/quickwit/quickwit-serve/src/elasticsearch_api/model/field_capability.rs index a382c541dc7..9aefdc83762 100644 --- a/quickwit/quickwit-serve/src/elasticsearch_api/model/field_capability.rs +++ b/quickwit/quickwit-serve/src/elasticsearch_api/model/field_capability.rs @@ -15,7 +15,10 @@ use std::collections::HashMap; use quickwit_proto::search::{ListFieldType, ListFieldsEntryResponse, ListFieldsResponse}; +use quickwit_query::ElasticQueryDsl; +use quickwit_query::query_ast::QueryAst; use serde::{Deserialize, Serialize}; +use warp::hyper::StatusCode; use super::ElasticsearchError; use super::search_query_params::*; @@ -173,16 +176,227 @@ pub fn convert_to_es_field_capabilities_response( FieldCapabilityResponse { indices, fields } } +/// Parses an Elasticsearch index_filter JSON value into a Quickwit QueryAst. +/// +/// Returns `Ok(None)` if the index_filter is null. +/// Returns `Ok(Some(QueryAst))` if the index_filter is valid. +/// Returns `Err` if the index_filter is invalid or cannot be converted (including empty object). 
+#[allow(clippy::result_large_err)] +pub fn parse_index_filter_to_query_ast( + index_filter: serde_json::Value, +) -> Result, ElasticsearchError> { + if index_filter.is_null() { + return Ok(None); + } + + // Parse ES Query DSL to internal QueryAst + let elastic_query_dsl: ElasticQueryDsl = + serde_json::from_value(index_filter).map_err(|err| { + ElasticsearchError::new( + StatusCode::BAD_REQUEST, + format!("Invalid index_filter: {err}"), + None, + ) + })?; + + let query_ast: QueryAst = elastic_query_dsl.try_into().map_err(|err: anyhow::Error| { + ElasticsearchError::new( + StatusCode::BAD_REQUEST, + format!("Failed to convert index_filter: {err}"), + None, + ) + })?; + + Ok(Some(query_ast)) +} + #[allow(clippy::result_large_err)] pub fn build_list_field_request_for_es_api( index_id_patterns: Vec, search_params: FieldCapabilityQueryParams, - _search_body: FieldCapabilityRequestBody, + search_body: FieldCapabilityRequestBody, ) -> Result { + let query_ast = parse_index_filter_to_query_ast(search_body.index_filter)?; + let query_ast_json = query_ast + .map(|ast| serde_json::to_string(&ast).expect("QueryAst should be JSON serializable")); + Ok(quickwit_proto::search::ListFieldsRequest { index_id_patterns, fields: search_params.fields.unwrap_or_default(), start_timestamp: search_params.start_timestamp, end_timestamp: search_params.end_timestamp, + query_ast: query_ast_json, }) } + +#[cfg(test)] +mod tests { + use serde_json::json; + + use super::*; + + #[test] + fn test_build_list_field_request_empty_index_filter() { + let result = build_list_field_request_for_es_api( + vec!["test_index".to_string()], + FieldCapabilityQueryParams::default(), + FieldCapabilityRequestBody::default(), + ) + .unwrap(); + + assert_eq!(result.index_id_patterns, vec!["test_index".to_string()]); + assert!(result.query_ast.is_none()); + } + + #[test] + fn test_build_list_field_request_with_term_index_filter() { + let search_body = FieldCapabilityRequestBody { + index_filter: json!({ + "term": { 
+ "status": "active" + } + }), + runtime_mappings: serde_json::Value::Null, + }; + + let result = build_list_field_request_for_es_api( + vec!["test_index".to_string()], + FieldCapabilityQueryParams::default(), + search_body, + ) + .unwrap(); + + assert_eq!(result.index_id_patterns, vec!["test_index".to_string()]); + assert!(result.query_ast.is_some()); + + // Verify the query_ast is valid JSON + let query_ast: serde_json::Value = + serde_json::from_str(&result.query_ast.unwrap()).unwrap(); + assert!(query_ast.is_object()); + } + + #[test] + fn test_build_list_field_request_with_bool_index_filter() { + let search_body = FieldCapabilityRequestBody { + index_filter: json!({ + "bool": { + "must": [ + { "term": { "status": "active" } } + ], + "filter": [ + { "range": { "age": { "gte": 18 } } } + ] + } + }), + runtime_mappings: serde_json::Value::Null, + }; + + let result = build_list_field_request_for_es_api( + vec!["test_index".to_string()], + FieldCapabilityQueryParams::default(), + search_body, + ) + .unwrap(); + + assert!(result.query_ast.is_some()); + } + + #[test] + fn test_build_list_field_request_with_invalid_index_filter() { + let search_body = FieldCapabilityRequestBody { + index_filter: json!({ + "invalid_query_type": { + "field": "value" + } + }), + runtime_mappings: serde_json::Value::Null, + }; + + let result = build_list_field_request_for_es_api( + vec!["test_index".to_string()], + FieldCapabilityQueryParams::default(), + search_body, + ); + + assert!(result.is_err()); + let err = result.unwrap_err(); + assert_eq!(err.status, StatusCode::BAD_REQUEST); + } + + #[test] + fn test_build_list_field_request_with_null_index_filter() { + let search_body = FieldCapabilityRequestBody { + index_filter: serde_json::Value::Null, + runtime_mappings: serde_json::Value::Null, + }; + + let result = build_list_field_request_for_es_api( + vec!["test_index".to_string()], + FieldCapabilityQueryParams::default(), + search_body, + ) + .unwrap(); + + 
assert!(result.query_ast.is_none()); + } + + #[test] + fn test_build_list_field_request_preserves_other_params() { + let search_params = FieldCapabilityQueryParams { + fields: Some(vec!["field1".to_string(), "field2".to_string()]), + start_timestamp: Some(1000), + end_timestamp: Some(2000), + ..Default::default() + }; + + let search_body = FieldCapabilityRequestBody { + index_filter: json!({ "match_all": {} }), + runtime_mappings: serde_json::Value::Null, + }; + + let result = build_list_field_request_for_es_api( + vec!["test_index".to_string()], + search_params, + search_body, + ) + .unwrap(); + + assert_eq!( + result.fields, + vec!["field1".to_string(), "field2".to_string()] + ); + assert_eq!(result.start_timestamp, Some(1000)); + assert_eq!(result.end_timestamp, Some(2000)); + assert!(result.query_ast.is_some()); + } + + #[test] + fn test_parse_index_filter_to_query_ast_null() { + let result = parse_index_filter_to_query_ast(serde_json::Value::Null).unwrap(); + assert!(result.is_none()); + } + + #[test] + fn test_parse_index_filter_to_query_ast_empty_object() { + // Empty object {} should return error to match ES behavior + let result = parse_index_filter_to_query_ast(json!({})); + assert!(result.is_err()); + } + + #[test] + fn test_parse_index_filter_to_query_ast_valid_term() { + let result = parse_index_filter_to_query_ast(json!({ + "term": { "status": "active" } + })) + .unwrap(); + assert!(result.is_some()); + } + + #[test] + fn test_parse_index_filter_to_query_ast_invalid() { + let result = parse_index_filter_to_query_ast(json!({ + "invalid_query_type": { "field": "value" } + })); + assert!(result.is_err()); + } +} diff --git a/quickwit/quickwit-serve/src/elasticsearch_api/rest_handler.rs b/quickwit/quickwit-serve/src/elasticsearch_api/rest_handler.rs index 70c21dcd6df..9abce6c61e6 100644 --- a/quickwit/quickwit-serve/src/elasticsearch_api/rest_handler.rs +++ b/quickwit/quickwit-serve/src/elasticsearch_api/rest_handler.rs @@ -200,6 +200,7 @@ async fn 
es_compat_index_mapping( fields: Vec::new(), start_timestamp: None, end_timestamp: None, + query_ast: None, }; let list_fields_response = search_service .root_list_fields(list_fields_request) diff --git a/quickwit/rest-api-tests/scenarii/es_field_capabilities/0001-field-capabilities.yaml b/quickwit/rest-api-tests/scenarii/es_field_capabilities/0001-field-capabilities.yaml index bd3cd917acd..a3c5041926d 100644 --- a/quickwit/rest-api-tests/scenarii/es_field_capabilities/0001-field-capabilities.yaml +++ b/quickwit/rest-api-tests/scenarii/es_field_capabilities/0001-field-capabilities.yaml @@ -31,10 +31,10 @@ expected: searchable: true aggregatable: true mixed: # This is a little weird case (values [5, -5.5]), since coercion happens only on the columnar side. That's why `long` is not aggregatable. - long: + long: metadata_field: false searchable: true - aggregatable: false + aggregatable: false double: metadata_field: false searchable: true @@ -88,10 +88,10 @@ expected: fields: $expect: "not 'id' in val" # Filtered by start_timestamp mixed: # This is a little weird case (values [5, -5.5]), since coercion happens only on the columnar side. That's why `long` is not aggregatable. 
- long: + long: metadata_field: false searchable: true - aggregatable: false + aggregatable: false double: metadata_field: false searchable: true @@ -103,8 +103,6 @@ expected: aggregatable: true --- # Test fields parameter with `.dynamic` suffix -engines: - - quickwit method: [GET] engines: - quickwit @@ -193,9 +191,6 @@ expected: --- # Compare with elastic search method: [GET] -engines: - - quickwit - - elasticsearch endpoint: fieldcaps/_field_caps?fields=nested.*ponse expected: indices: @@ -210,9 +205,6 @@ expected: --- # Compare ip field with elastic search method: [GET] -engines: - - quickwit - - elasticsearch endpoint: fieldcaps*/_field_caps?fields=host expected: indices: @@ -295,9 +287,6 @@ expected: --- # Wildcard on index name + Wildcard without match method: [GET] -engines: - - quickwit - - elasticsearch endpoint: fieldca*,blub*/_field_caps?fields=date expected: indices: @@ -313,24 +302,140 @@ expected: --- # Exact match index + Non matching exact index method: [GET] -engines: - - quickwit - - elasticsearch endpoint: fieldcaps,blub/_field_caps?fields=date status_code: 404 --- # Compare ip field with elastic search method: [GET] -engines: - - quickwit - - elasticsearch endpoint: doesnotexist/_field_caps?fields=date status_code: 404 --- # Compare ip field with elastic search method: [GET] +endpoint: doesno*texist/_field_caps?fields=date +status_code: 200 +--- +# Test _field_caps API with index_filter (term query) +# Note: term queries require exact token match; 'fritz' is lowercase due to default tokenizer +method: [POST] +endpoint: fieldcaps/_field_caps?fields=* +json: + index_filter: + term: + name: "fritz" +expected: + indices: + - fieldcaps + fields: + name: + keyword: + type: keyword + metadata_field: false + searchable: true + aggregatable: true + text: + type: text + metadata_field: false + searchable: true + aggregatable: true +--- +# Test _field_caps API with index_filter (match_all query) +method: [POST] +endpoint: fieldcaps/_field_caps?fields=name 
+json: + index_filter: + match_all: {} +expected: + indices: + - fieldcaps + fields: + name: + keyword: + type: keyword + metadata_field: false + searchable: true + aggregatable: true + text: + type: text + metadata_field: false + searchable: true + aggregatable: true +--- +# Test _field_caps API with index_filter (bool query) +method: [POST] +endpoint: fieldcaps/_field_caps?fields=response,name +json: + index_filter: + bool: + must: + - term: + name: "fritz" + filter: + - range: + response: + gte: 30 +expected: + indices: + - fieldcaps + fields: + response: + long: + type: long + metadata_field: false + searchable: true + aggregatable: true + name: + keyword: + type: keyword + metadata_field: false + searchable: true + aggregatable: true + text: + type: text + metadata_field: false + searchable: true + aggregatable: true +--- +# Test _field_caps API with invalid index_filter +method: [POST] +endpoint: fieldcaps/_field_caps?fields=* +json: + index_filter: + invalid_query_type: + field: "value" +status_code: 400 +--- +# Test _field_caps API with empty index_filter (should return 400 like ES) +method: [POST] engines: - quickwit - elasticsearch -endpoint: doesno*texist/_field_caps?fields=date -status_code: 200 +endpoint: fieldcaps/_field_caps?fields=name +json: + index_filter: {} +status_code: 400 +--- +# Test _field_caps API with index_filter using tag field for split pruning (QW-only) +method: [POST] +engines: + - quickwit +endpoint: fieldcaps/_field_caps?fields=name +json: + index_filter: + term: + tags: "nice" +expected: + indices: + - fieldcaps + fields: + name: + keyword: + type: keyword + metadata_field: false + searchable: true + aggregatable: true + text: + type: text + metadata_field: false + searchable: true + aggregatable: true diff --git a/quickwit/rest-api-tests/scenarii/es_field_capabilities/_setup.quickwit.yaml b/quickwit/rest-api-tests/scenarii/es_field_capabilities/_setup.quickwit.yaml index 8b02ee01882..5576e6cec28 100644 --- 
a/quickwit/rest-api-tests/scenarii/es_field_capabilities/_setup.quickwit.yaml +++ b/quickwit/rest-api-tests/scenarii/es_field_capabilities/_setup.quickwit.yaml @@ -22,6 +22,7 @@ json: tokenizer: default fast: true timestamp_field: date + tag_fields: ["tags"] field_mappings: - name: date type: datetime @@ -32,6 +33,10 @@ json: - name: host type: ip fast: true + - name: tags + type: array<text> + tokenizer: raw + fast: true --- # Create index method: POST