Skip to content

Commit ca620e9

Browse files
authored
Trim utf8 (#25)
Trim strings larger than 1 MB by default. If the cut falls inside a multi-byte UTF-8 character, discard that character and move the cut position back to its first byte.
1 parent 591e8a7 commit ca620e9

4 files changed

Lines changed: 157 additions & 19 deletions

File tree

expected/yagp_utf8_trim.out

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
CREATE EXTENSION IF NOT EXISTS yagp_hooks_collector;
2+
CREATE OR REPLACE FUNCTION get_marked_query(marker TEXT)
3+
RETURNS TEXT AS $$
4+
SELECT query_text
5+
FROM yagpcc.log
6+
WHERE query_text LIKE '%' || marker || '%'
7+
ORDER BY datetime DESC
8+
LIMIT 1
9+
$$ LANGUAGE sql VOLATILE;
10+
SET yagpcc.enable TO TRUE;
11+
-- Test 1: 1 byte chars
12+
SET yagpcc.max_text_size to 19;
13+
SET yagpcc.logging_mode to 'TBL';
14+
SELECT /*test1*/ 'HelloWorld';
15+
?column?
16+
------------
17+
HelloWorld
18+
(1 row)
19+
20+
RESET yagpcc.logging_mode;
21+
SELECT octet_length(get_marked_query('test1')) = 19 AS correct_length;
22+
correct_length
23+
----------------
24+
t
25+
(1 row)
26+
27+
-- Test 2: 2 byte chars
28+
SET yagpcc.max_text_size to 19;
29+
SET yagpcc.logging_mode to 'TBL';
30+
SELECT /*test2*/ 'РУССКИЙЯЗЫК';
31+
?column?
32+
-------------
33+
РУССКИЙЯЗЫК
34+
(1 row)
35+
36+
RESET yagpcc.logging_mode;
37+
-- Character 'Р' has two bytes and cut in the middle => not included.
38+
SELECT octet_length(get_marked_query('test2')) = 18 AS correct_length;
39+
correct_length
40+
----------------
41+
t
42+
(1 row)
43+
44+
-- Test 3: 4 byte chars
45+
SET yagpcc.max_text_size to 21;
46+
SET yagpcc.logging_mode to 'TBL';
47+
SELECT /*test3*/ '😀';
48+
?column?
49+
----------
50+
😀
51+
(1 row)
52+
53+
RESET yagpcc.logging_mode;
54+
-- Emoji has 4 bytes and cut before the last byte => not included.
55+
SELECT octet_length(get_marked_query('test3')) = 18 AS correct_length;
56+
correct_length
57+
----------------
58+
t
59+
(1 row)
60+
61+
-- Cleanup
62+
DROP FUNCTION get_marked_query(TEXT);
63+
RESET yagpcc.max_text_size;
64+
RESET yagpcc.logging_mode;
65+
RESET yagpcc.enable;
66+
DROP EXTENSION yagp_hooks_collector;

sql/yagp_utf8_trim.sql

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
CREATE EXTENSION IF NOT EXISTS yagp_hooks_collector;
2+
3+
CREATE OR REPLACE FUNCTION get_marked_query(marker TEXT)
4+
RETURNS TEXT AS $$
5+
SELECT query_text
6+
FROM yagpcc.log
7+
WHERE query_text LIKE '%' || marker || '%'
8+
ORDER BY datetime DESC
9+
LIMIT 1
10+
$$ LANGUAGE sql VOLATILE;
11+
12+
SET yagpcc.enable TO TRUE;
13+
14+
-- Test 1: 1 byte chars
15+
SET yagpcc.max_text_size to 19;
16+
SET yagpcc.logging_mode to 'TBL';
17+
SELECT /*test1*/ 'HelloWorld';
18+
RESET yagpcc.logging_mode;
19+
SELECT octet_length(get_marked_query('test1')) = 19 AS correct_length;
20+
21+
-- Test 2: 2 byte chars
22+
SET yagpcc.max_text_size to 19;
23+
SET yagpcc.logging_mode to 'TBL';
24+
SELECT /*test2*/ 'РУССКИЙЯЗЫК';
25+
RESET yagpcc.logging_mode;
26+
-- Character 'Р' has two bytes and cut in the middle => not included.
27+
SELECT octet_length(get_marked_query('test2')) = 18 AS correct_length;
28+
29+
-- Test 3: 4 byte chars
30+
SET yagpcc.max_text_size to 21;
31+
SET yagpcc.logging_mode to 'TBL';
32+
SELECT /*test3*/ '😀';
33+
RESET yagpcc.logging_mode;
34+
-- Emoji has 4 bytes and cut before the last byte => not included.
35+
SELECT octet_length(get_marked_query('test3')) = 18 AS correct_length;
36+
37+
-- Cleanup
38+
DROP FUNCTION get_marked_query(TEXT);
39+
RESET yagpcc.max_text_size;
40+
RESET yagpcc.logging_mode;
41+
RESET yagpcc.enable;
42+
43+
DROP EXTENSION yagp_hooks_collector;

src/Config.cpp

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -16,9 +16,9 @@ static bool guc_enable_cdbstats = true;
1616
static bool guc_enable_collector = true;
1717
static bool guc_report_nested_queries = true;
1818
static char *guc_ignored_users = nullptr;
19-
static int guc_max_text_size = 1024; // in KB
20-
static int guc_max_plan_size = 1024; // in KB
21-
static int guc_min_analyze_time = 10000; // in ms
19+
static int guc_max_text_size = 1 << 20; // in bytes (1MB)
20+
static int guc_max_plan_size = 1024; // in KB
21+
static int guc_min_analyze_time = 10000; // in ms
2222
static int guc_logging_mode = LOG_MODE_UDS;
2323
static bool guc_enable_utility = false;
2424

@@ -99,9 +99,9 @@ void Config::init() {
9999

100100
DefineCustomIntVariable(
101101
"yagpcc.max_text_size",
102-
"Make yagpcc trim query texts longer than configured size", NULL,
103-
&guc_max_text_size, 1024, 0, INT_MAX / 1024, PGC_SUSET,
104-
GUC_NOT_IN_SAMPLE | GUC_GPDB_NEED_SYNC | GUC_UNIT_KB, NULL, NULL, NULL);
102+
"Make yagpcc trim query texts longer than configured size in bytes", NULL,
103+
&guc_max_text_size, 1 << 20 /* 1MB */, 0, INT_MAX, PGC_SUSET,
104+
GUC_NOT_IN_SAMPLE | GUC_GPDB_NEED_SYNC, NULL, NULL, NULL);
105105

106106
DefineCustomIntVariable(
107107
"yagpcc.max_plan_size",
@@ -134,7 +134,7 @@ bool Config::enable_cdbstats() { return guc_enable_cdbstats; }
134134
bool Config::enable_collector() { return guc_enable_collector; }
135135
bool Config::enable_utility() { return guc_enable_utility; }
136136
bool Config::report_nested_queries() { return guc_report_nested_queries; }
137-
size_t Config::max_text_size() { return guc_max_text_size * 1024; }
137+
// Returns the yagpcc.max_text_size setting as-is: the GUC value is kept in
// bytes, so (unlike max_plan_size) no KB-to-bytes scaling is applied here.
size_t Config::max_text_size() { return guc_max_text_size; }
138138
size_t Config::max_plan_size() { return guc_max_plan_size * 1024; }
139139
int Config::min_analyze_time() { return guc_min_analyze_time; };
140140
int Config::logging_mode() { return guc_logging_mode; }

src/ProtoUtils.cpp

Lines changed: 41 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,18 @@ extern "C" {
2424
#include <ctime>
2525
#include <string>
2626

27+
namespace {
28+
// Selects the two most significant bits of a byte (0xC0).
constexpr uint8_t UTF8_CONTINUATION_BYTE_MASK = (1 << 7) | (1 << 6);
29+
// Value of those two bits (binary 10, i.e. 0x80) on a UTF-8 continuation byte.
constexpr uint8_t UTF8_CONTINUATION_BYTE = (1 << 7);
30+
// Maximum number of bytes in one UTF-8 encoded character.
constexpr uint8_t UTF8_MAX_SYMBOL_BYTES = 4;
31+
32+
// Returns true if byte is the starting byte of utf8
33+
// character, false if byte is the continuation (10xxxxxx).
// Only the top two bits are inspected, so single-byte characters (0xxxxxxx)
// and multi-byte lead bytes (11xxxxxx) are both reported as start bytes.
34+
inline bool utf8_start_byte(uint8_t byte) {
35+
return (byte & UTF8_CONTINUATION_BYTE_MASK) != UTF8_CONTINUATION_BYTE;
36+
}
37+
} // namespace
38+
2739
google::protobuf::Timestamp current_ts() {
2840
google::protobuf::Timestamp current_ts;
2941
struct timeval tv;
@@ -46,9 +58,26 @@ void set_segment_key(yagpcc::SegmentKey *key) {
4658
key->set_segindex(GpIdentity.segindex);
4759
}
4860

49-
inline std::string char_to_trimmed_str(const char *str, size_t len,
50-
size_t lim) {
51-
return std::string(str, std::min(len, lim));
61+
// Copies at most `lim` bytes of `str` into a std::string without leaving a
// partial UTF-8 character at the end.
//
// str - source buffer; nullptr yields an empty string.
// len - number of bytes available in `str`.
// lim - maximum number of bytes to keep.
//
// When `len <= lim`, or the database encoding is not PG_UTF8, the result is a
// plain byte-wise prefix of min(len, lim) bytes. Otherwise the cut position is
// moved back from `lim` until it sits on a character start byte, so the result
// may be up to UTF8_MAX_SYMBOL_BYTES - 1 bytes shorter than `lim`.
std::string trim_str_shrink_utf8(const char *str, size_t len, size_t lim) {
62+
if (unlikely(str == nullptr)) {
63+
return std::string();
64+
}
65+
// Fast path: nothing to trim, or the encoding is not UTF-8, so a byte-wise
// cut is used as-is.
if (likely(len <= lim || GetDatabaseEncoding() != PG_UTF8)) {
66+
return std::string(str, std::min(len, lim));
67+
}
68+
69+
// Handle trimming of utf8 correctly, do not cut multi-byte characters.
// Here len > lim, so str[lim] (the first byte that would be dropped) is a
// valid index. If it is a start byte, the first `lim` bytes already end on a
// character boundary.
70+
size_t cut_pos = lim;
71+
// Counts the byte at cut_pos plus the continuation bytes already stepped
// over; starting at 1 limits the loop to UTF8_MAX_SYMBOL_BYTES - 1 steps back.
size_t visited_bytes = 1;
72+
while (visited_bytes < UTF8_MAX_SYMBOL_BYTES && cut_pos > 0) {
73+
if (utf8_start_byte(static_cast<uint8_t>(str[cut_pos]))) {
74+
break;
75+
}
76+
++visited_bytes;
77+
--cut_pos;
78+
}
79+
80+
// cut_pos is the index of the first byte NOT copied. After the maximum number
// of steps the byte at cut_pos is not inspected; in well-formed UTF-8 it is
// the lead byte of a 4-byte character, which is therefore dropped whole.
return std::string(str, cut_pos);
5281
}
5382

5483
void set_query_plan(yagpcc::SetQueryReq *req, QueryDesc *query_desc) {
@@ -61,10 +90,10 @@ void set_query_plan(yagpcc::SetQueryReq *req, QueryDesc *query_desc) {
6190
ya_gpdb::mem_ctx_switch_to(query_desc->estate->es_query_cxt);
6291
ExplainState es = ya_gpdb::get_explain_state(query_desc, true);
6392
if (es.str) {
64-
*qi->mutable_plan_text() = char_to_trimmed_str(es.str->data, es.str->len,
65-
Config::max_plan_size());
93+
*qi->mutable_plan_text() = trim_str_shrink_utf8(es.str->data, es.str->len,
94+
Config::max_plan_size());
6695
StringInfo norm_plan = ya_gpdb::gen_normplan(es.str->data);
67-
*qi->mutable_template_plan_text() = char_to_trimmed_str(
96+
*qi->mutable_template_plan_text() = trim_str_shrink_utf8(
6897
norm_plan->data, norm_plan->len, Config::max_plan_size());
6998
qi->set_plan_id(
7099
hash_any((unsigned char *)norm_plan->data, norm_plan->len));
@@ -79,11 +108,11 @@ void set_query_plan(yagpcc::SetQueryReq *req, QueryDesc *query_desc) {
79108
void set_query_text(yagpcc::SetQueryReq *req, QueryDesc *query_desc) {
80109
if (Gp_session_role == GP_ROLE_DISPATCH && query_desc->sourceText) {
81110
auto qi = req->mutable_query_info();
82-
*qi->mutable_query_text() = char_to_trimmed_str(
111+
*qi->mutable_query_text() = trim_str_shrink_utf8(
83112
query_desc->sourceText, strlen(query_desc->sourceText),
84113
Config::max_text_size());
85114
char *norm_query = ya_gpdb::gen_normquery(query_desc->sourceText);
86-
*qi->mutable_template_query_text() = char_to_trimmed_str(
115+
*qi->mutable_template_query_text() = trim_str_shrink_utf8(
87116
norm_query, strlen(norm_query), Config::max_text_size());
88117
}
89118
}
@@ -122,7 +151,7 @@ void set_qi_slice_id(yagpcc::SetQueryReq *req) {
122151
void set_qi_error_message(yagpcc::SetQueryReq *req, const char *err_msg) {
123152
auto aqi = req->mutable_add_info();
124153
*aqi->mutable_error_message() =
125-
char_to_trimmed_str(err_msg, strlen(err_msg), Config::max_text_size());
154+
trim_str_shrink_utf8(err_msg, strlen(err_msg), Config::max_text_size());
126155
}
127156

128157
void set_metric_instrumentation(yagpcc::MetricInstrumentation *metrics,
@@ -242,9 +271,9 @@ void set_analyze_plan_text(QueryDesc *query_desc, yagpcc::SetQueryReq *req) {
242271
if (es.str->len > 0 && es.str->data[es.str->len - 1] == '\n') {
243272
es.str->data[--es.str->len] = '\0';
244273
}
245-
auto trimmed_analyze =
246-
char_to_trimmed_str(es.str->data, es.str->len, Config::max_plan_size());
274+
auto trimmed_analyze = trim_str_shrink_utf8(es.str->data, es.str->len,
275+
Config::max_plan_size());
247276
req->mutable_query_info()->set_analyze_text(trimmed_analyze);
248277
ya_gpdb::pfree(es.str->data);
249278
}
250-
}
279+
}

0 commit comments

Comments
 (0)