@@ -24,6 +24,18 @@ extern "C" {
2424#include < ctime>
2525#include < string>
2626
namespace {
// The two most significant bits of a byte decide whether it is a UTF-8
// continuation byte: continuation bytes have the form 10xxxxxx.
constexpr uint8_t UTF8_CONTINUATION_BYTE_MASK = 0xC0;
constexpr uint8_t UTF8_CONTINUATION_BYTE = 0x80;
// Upper bound on the number of bytes a single UTF-8 character may occupy.
constexpr uint8_t UTF8_MAX_SYMBOL_BYTES = 4;

// Tells whether `byte` can begin a UTF-8 character.
// Returns false only for continuation bytes (bit pattern 10xxxxxx) and true
// for every other value.
inline bool utf8_start_byte(uint8_t byte) {
  const bool is_continuation =
      (byte & UTF8_CONTINUATION_BYTE_MASK) == UTF8_CONTINUATION_BYTE;
  return !is_continuation;
}
}  // namespace
38+
2739google::protobuf::Timestamp current_ts () {
2840 google::protobuf::Timestamp current_ts;
2941 struct timeval tv;
@@ -46,9 +58,26 @@ void set_segment_key(yagpcc::SegmentKey *key) {
4658 key->set_segindex (GpIdentity.segindex );
4759}
4860
49- inline std::string char_to_trimmed_str (const char *str, size_t len,
50- size_t lim) {
51- return std::string (str, std::min (len, lim));
61+ std::string trim_str_shrink_utf8 (const char *str, size_t len, size_t lim) {
62+ if (unlikely (str == nullptr )) {
63+ return std::string ();
64+ }
65+ if (likely (len <= lim || GetDatabaseEncoding () != PG_UTF8)) {
66+ return std::string (str, std::min (len, lim));
67+ }
68+
69+ // Handle trimming of utf8 correctly, do not cut multi-byte characters.
70+ size_t cut_pos = lim;
71+ size_t visited_bytes = 1 ;
72+ while (visited_bytes < UTF8_MAX_SYMBOL_BYTES && cut_pos > 0 ) {
73+ if (utf8_start_byte (static_cast <uint8_t >(str[cut_pos]))) {
74+ break ;
75+ }
76+ ++visited_bytes;
77+ --cut_pos;
78+ }
79+
80+ return std::string (str, cut_pos);
5281}
5382
5483void set_query_plan (yagpcc::SetQueryReq *req, QueryDesc *query_desc) {
@@ -61,10 +90,10 @@ void set_query_plan(yagpcc::SetQueryReq *req, QueryDesc *query_desc) {
6190 ya_gpdb::mem_ctx_switch_to (query_desc->estate ->es_query_cxt );
6291 ExplainState es = ya_gpdb::get_explain_state (query_desc, true );
6392 if (es.str ) {
64- *qi->mutable_plan_text () = char_to_trimmed_str (es.str ->data , es.str ->len ,
65- Config::max_plan_size ());
93+ *qi->mutable_plan_text () = trim_str_shrink_utf8 (es.str ->data , es.str ->len ,
94+ Config::max_plan_size ());
6695 StringInfo norm_plan = ya_gpdb::gen_normplan (es.str ->data );
67- *qi->mutable_template_plan_text () = char_to_trimmed_str (
96+ *qi->mutable_template_plan_text () = trim_str_shrink_utf8 (
6897 norm_plan->data , norm_plan->len , Config::max_plan_size ());
6998 qi->set_plan_id (
7099 hash_any ((unsigned char *)norm_plan->data , norm_plan->len ));
@@ -79,11 +108,11 @@ void set_query_plan(yagpcc::SetQueryReq *req, QueryDesc *query_desc) {
79108void set_query_text (yagpcc::SetQueryReq *req, QueryDesc *query_desc) {
80109 if (Gp_session_role == GP_ROLE_DISPATCH && query_desc->sourceText ) {
81110 auto qi = req->mutable_query_info ();
82- *qi->mutable_query_text () = char_to_trimmed_str (
111+ *qi->mutable_query_text () = trim_str_shrink_utf8 (
83112 query_desc->sourceText , strlen (query_desc->sourceText ),
84113 Config::max_text_size ());
85114 char *norm_query = ya_gpdb::gen_normquery (query_desc->sourceText );
86- *qi->mutable_template_query_text () = char_to_trimmed_str (
115+ *qi->mutable_template_query_text () = trim_str_shrink_utf8 (
87116 norm_query, strlen (norm_query), Config::max_text_size ());
88117 }
89118}
@@ -122,7 +151,7 @@ void set_qi_slice_id(yagpcc::SetQueryReq *req) {
122151void set_qi_error_message (yagpcc::SetQueryReq *req, const char *err_msg) {
123152 auto aqi = req->mutable_add_info ();
124153 *aqi->mutable_error_message () =
125- char_to_trimmed_str (err_msg, strlen (err_msg), Config::max_text_size ());
154+ trim_str_shrink_utf8 (err_msg, strlen (err_msg), Config::max_text_size ());
126155}
127156
128157void set_metric_instrumentation (yagpcc::MetricInstrumentation *metrics,
@@ -242,9 +271,9 @@ void set_analyze_plan_text(QueryDesc *query_desc, yagpcc::SetQueryReq *req) {
242271 if (es.str ->len > 0 && es.str ->data [es.str ->len - 1 ] == ' \n ' ) {
243272 es.str ->data [--es.str ->len ] = ' \0 ' ;
244273 }
245- auto trimmed_analyze =
246- char_to_trimmed_str (es. str -> data , es. str -> len , Config::max_plan_size ());
274+ auto trimmed_analyze = trim_str_shrink_utf8 (es. str -> data , es. str -> len ,
275+ Config::max_plan_size ());
247276 req->mutable_query_info ()->set_analyze_text (trimmed_analyze);
248277 ya_gpdb::pfree (es.str ->data );
249278 }
250- }
279+ }
0 commit comments