Skip to content

Commit

Permalink
feat(stdlib): add encode_charset, decode_charset functions (#1162)
Browse files Browse the repository at this point in the history
* feat(stdlib): add encode_charset, decode_charset functions

Charset encoding and decoding functions were implemented so that Vector can interoperate with the character sets used by various operating systems and other systems.

- decode_charset(value, from_charset)
- encode_charset(value, to_charset)

* test(stdlib): add encode/decode_charset to benches/stdlib.rs

* docs(stdlib): add changelog.d

* chore(stdlib): Modified the error function and its parameter name

- rename `error` function to `create_error`
- rename `from` to `from_charset` in decode_charset
- rename `from` to `to_charset` in encode_charset

---------

Co-authored-by: Jesse Szwedko <[email protected]>
  • Loading branch information
powerumc and jszwedko authored Dec 10, 2024
1 parent 821f0d0 commit ddcb49e
Show file tree
Hide file tree
Showing 7 changed files with 339 additions and 0 deletions.
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -127,6 +127,7 @@ base62 = { version = "2.0.3", optional = true }
base64 = { version = "0.22", optional = true }
bytes = { version = "1", default-features = false, optional = true }
charset = { version = "0.1", optional = true }
encoding_rs = { version = "0.8.35", optional = false }
chrono = { version = "0.4", default-features = false, features = ["clock", "serde", "wasmbind"], optional = true }
chrono-tz = { version = "0.10", default-features = false, optional = true }
cidr-utils = { version = "0.6", optional = true }
Expand Down
22 changes: 22 additions & 0 deletions benches/stdlib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ criterion_group!(
contains,
decode_base16,
decode_base64,
decode_charset,
decode_percent,
decode_punycode,
decrypt,
Expand All @@ -35,6 +36,7 @@ criterion_group!(
downcase,
encode_base16,
encode_base64,
encode_charset,
encode_key_value,
encode_json,
encode_logfmt,
Expand Down Expand Up @@ -306,6 +308,16 @@ bench_function! {
}
}

// Benchmarks `vrl::stdlib::DecodeCharset` on a literal byte string labelled
// "euc-kr"; the expected result is the decoded UTF-8 text in `want`.
bench_function! {
decode_charset => vrl::stdlib::DecodeCharset;

literal {
args: func_args![value: b"\xbe\xc8\xb3\xe7\xc7\xcf\xbc\xbc\xbf\xe4",
from_charset: value!("euc-kr")],
want: Ok(value!("안녕하세요")),
}
}

bench_function! {
decode_percent => vrl::stdlib::DecodePercent;

Expand Down Expand Up @@ -380,6 +392,16 @@ bench_function! {
}
}

// Benchmarks `vrl::stdlib::EncodeCharset` on a literal UTF-8 string with
// `to_charset` set to "euc-kr"; the expected result is the byte string in `want`.
bench_function! {
encode_charset => vrl::stdlib::EncodeCharset;

literal {
args: func_args![value: value!("안녕하세요"),
to_charset: value!("euc-kr")],
want: Ok(value!(b"\xbe\xc8\xb3\xe7\xc7\xcf\xbc\xbc\xbf\xe4")),
}
}

bench_function! {
encode_key_value => vrl::stdlib::EncodeKeyValue;

Expand Down
1 change: 1 addition & 0 deletions changelog.d/1162.feature.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Add new `decode_charset`, `encode_charset` functions to decode and encode strings between different charsets.
156 changes: 156 additions & 0 deletions src/stdlib/decode_charset.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,156 @@
use crate::diagnostic::Label;
use crate::prelude::*;
use encoding_rs::Encoding;
use nom::AsBytes;
use std::str::from_utf8;

/// VRL stdlib function `decode_charset(value, from_charset)`.
///
/// Interprets `value` as text encoded in `from_charset` and yields the same
/// text as UTF-8 bytes.
#[derive(Clone, Copy, Debug)]
pub struct DecodeCharset;

impl Function for DecodeCharset {
    /// Name under which the function is exposed.
    fn identifier(&self) -> &'static str {
        "decode_charset"
    }

    /// One-line description of the function.
    fn summary(&self) -> &'static str {
        "Decode non UTF-8 charset to UTF-8"
    }

    /// Longer description covering both parameters.
    fn usage(&self) -> &'static str {
        indoc! {"
            Decode non UTF-8 charset to UTF-8.
            The `value` parameter is a non UTF-8 encoded string.
            The `from_charset` parameter specifies the charset of the `value`.
        "}
    }

    /// Both arguments are required byte strings.
    fn parameters(&self) -> &'static [Parameter] {
        const PARAMETERS: &[Parameter] = &[
            Parameter {
                keyword: "value",
                kind: kind::BYTES,
                required: true,
            },
            Parameter {
                keyword: "from_charset",
                kind: kind::BYTES,
                required: true,
            },
        ];

        PARAMETERS
    }

    /// Worked examples; inputs are supplied through `decode_base64!` because
    /// the source bytes are not valid UTF-8 literals.
    fn examples(&self) -> &'static [Example] {
        &[
            Example {
                title: "Decode charset from euc-kr",
                source: r#"decode_charset!(decode_base64!("vsiz58fPvLy/5A=="), "euc-kr")"#,
                result: Ok("안녕하세요"),
            },
            Example {
                title: "Decode charset from euc-jp",
                source: r#"decode_charset!(decode_base64!("pLOk86TLpMGkzw=="), "euc-jp")"#,
                result: Ok("こんにちは"),
            },
            Example {
                title: "Decode charset from gb2312",
                source: r#"decode_charset!(decode_base64!("xOO6ww=="), "gb2312")"#,
                result: Ok("你好"),
            },
        ]
    }

    /// Captures the two argument expressions in a `DecodeCharsetFn`.
    fn compile(
        &self,
        _state: &TypeState,
        _ctx: &mut FunctionCompileContext,
        arguments: ArgumentList,
    ) -> Compiled {
        let expression = DecodeCharsetFn {
            value: arguments.required("value"),
            from_charset: arguments.required("from_charset"),
        };

        Ok(expression.as_expr())
    }
}

/// Decodes `value` from the charset named by `from_charset` into UTF-8 bytes.
///
/// `from_charset` is looked up with `Encoding::for_label`.
///
/// # Errors
///
/// Returns the "Unknown charset" error built by `create_error` when the label
/// is not recognised.
fn decode_charset(value: &[u8], from_charset: &[u8]) -> Resolved {
    let decoder = Encoding::for_label(from_charset).ok_or_else(|| create_error(from_charset))?;

    // Only the decoded text is kept; the other two tuple elements (the
    // encoding actually used and the `had_errors` flag, per the encoding_rs
    // documentation) are deliberately discarded, so malformed input does not
    // produce an error here.
    let (output, _, _) = decoder.decode(value);

    // `into_owned` hands over the decoder's buffer when it already allocated
    // one, instead of unconditionally copying it through `to_vec()`.
    Ok(Value::Bytes(output.into_owned().into()))
}

/// Builds the error reported when `from_charset` is not a known encoding label.
///
/// The label is echoed in the message; if it is not valid UTF-8 the literal
/// text `unknown` is shown in its place.
fn create_error(from_charset: &[u8]) -> ExpressionError {
    let charset_name = from_utf8(from_charset).unwrap_or("unknown");

    let message = format!("Unknown charset: {}", charset_name);
    let labels = vec![Label::primary("Unknown charset", Span::default())];
    // Point users at the standard that defines the accepted labels.
    let notes = vec![Note::SeeDocs(
        String::from("Encoding Living Standard"),
        String::from("https://encoding.spec.whatwg.org/"),
    )];

    ExpressionError::Error {
        message,
        labels,
        notes,
    }
}

/// Compiled form of a `decode_charset` call, holding its two argument
/// expressions.
#[derive(Debug, Clone)]
struct DecodeCharsetFn {
    /// Expression producing the bytes to decode.
    value: Box<dyn Expression>,
    /// Expression producing the source charset label.
    from_charset: Box<dyn Expression>,
}

impl FunctionExpression for DecodeCharsetFn {
    /// Resolves both arguments to byte strings (`value` first) and decodes.
    fn resolve(&self, ctx: &mut Context) -> Resolved {
        let encoded = self.value.resolve(ctx)?.try_bytes()?;
        let label = self.from_charset.resolve(ctx)?.try_bytes()?;

        decode_charset(encoded.as_bytes(), label.as_bytes())
    }

    /// The result is bytes, and the call may fail.
    fn type_def(&self, _: &TypeState) -> TypeDef {
        TypeDef::bytes().fallible()
    }
}

// Unit tests for `decode_charset`, expressed through the `test_function!` macro.
// Each case lists the call arguments, the expected result, and the expected
// type definition.
#[cfg(test)]
mod test {
use super::*;
use crate::value;

test_function![
decode_charset => DecodeCharset;

// Input bytes labelled "euc-kr" decode to the Korean text in `want`.
decode_from_euc_kr {
args: func_args![value: b"\xbe\xc8\xb3\xe7\xc7\xcf\xbc\xbc\xbf\xe4",
from_charset: value!("euc-kr")],
want: Ok(value!("안녕하세요")),
tdef: TypeDef::bytes().fallible(),
}

// Input bytes labelled "euc-jp" decode to the Japanese text in `want`.
decode_from_euc_jp {
args: func_args![value: b"\xa4\xb3\xa4\xf3\xa4\xcb\xa4\xc1\xa4\xcf",
from_charset: value!("euc-jp")],
want: Ok(value!("こんにちは")),
tdef: TypeDef::bytes().fallible(),
}

// Input bytes labelled "gb2312" decode to the Chinese text in `want`.
decode_from_gb2312 {
args: func_args![value: b"\xc4\xe3\xba\xc3",
from_charset: value!("gb2312")],
want: Ok(value!("你好")),
tdef: TypeDef::bytes().fallible(),
}

// An unrecognised label ("euc--kr") yields the "Unknown charset" error.
unknown_charset {
args: func_args![value: value!(b"\xbe\xc8\xb3\xe7\xc7\xcf\xbc\xbc\xbf\xe4"),
from_charset: value!(b"euc--kr")],
want: Err("Unknown charset: euc--kr"),
tdef: TypeDef::bytes().fallible(),
}
];
}
152 changes: 152 additions & 0 deletions src/stdlib/encode_charset.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,152 @@
use crate::diagnostic::Label;
use crate::prelude::*;
use encoding_rs::Encoding;
use nom::AsBytes;
use std::str::from_utf8;

/// VRL stdlib function `encode_charset(value, to_charset)`.
///
/// Takes UTF-8 text in `value` and yields it encoded in `to_charset`.
#[derive(Clone, Copy, Debug)]
pub struct EncodeCharset;

impl Function for EncodeCharset {
    /// Name under which the function is exposed.
    fn identifier(&self) -> &'static str {
        "encode_charset"
    }

    /// One-line description of the function.
    fn summary(&self) -> &'static str {
        "Encode UTF-8 to non UTF-8 charset"
    }

    /// Longer description covering both parameters.
    fn usage(&self) -> &'static str {
        indoc! {"
            Encode UTF-8 to non UTF-8 charset.
            The `value` parameter is a UTF-8 encoded string.
            The `to_charset` parameter specifies the charset to encode the `value`.
        "}
    }

    /// Both arguments are required byte strings.
    fn parameters(&self) -> &'static [Parameter] {
        const PARAMETERS: &[Parameter] = &[
            Parameter {
                keyword: "value",
                kind: kind::BYTES,
                required: true,
            },
            Parameter {
                keyword: "to_charset",
                kind: kind::BYTES,
                required: true,
            },
        ];

        PARAMETERS
    }

    /// Worked examples; results are wrapped in `encode_base64` so the encoded
    /// bytes can be shown as text.
    fn examples(&self) -> &'static [Example] {
        &[
            Example {
                title: "Encode charset to euc-kr",
                source: r#"encode_base64(encode_charset!("안녕하세요", "euc-kr"))"#,
                result: Ok("vsiz58fPvLy/5A=="),
            },
            Example {
                title: "Encode charset to euc-jp",
                source: r#"encode_base64(encode_charset!("こんにちは", "euc-jp"))"#,
                result: Ok("pLOk86TLpMGkzw=="),
            },
            Example {
                title: "Encode charset to gb2312",
                source: r#"encode_base64(encode_charset!("你好", "gb2312"))"#,
                result: Ok("xOO6ww=="),
            },
        ]
    }

    /// Captures the two argument expressions in the function's expression type.
    fn compile(
        &self,
        _state: &TypeState,
        _ctx: &mut FunctionCompileContext,
        arguments: ArgumentList,
    ) -> Compiled {
        let expression = DecodeCharsetFn {
            value: arguments.required("value"),
            to_charset: arguments.required("to_charset"),
        };

        Ok(expression.as_expr())
    }
}

/// Encodes the UTF-8 text `value` into the charset named by `to_charset`.
///
/// `to_charset` is looked up with `Encoding::for_label`.
///
/// # Errors
///
/// Returns the "Unknown charset" error built by `create_error` when the label
/// is not recognised.
fn encode_charset(value: &str, to_charset: &[u8]) -> Resolved {
    let encoder = Encoding::for_label(to_charset).ok_or_else(|| create_error(to_charset))?;

    // Only the encoded bytes are kept; the other two tuple elements (the
    // encoding actually used and the `had_errors` flag, per the encoding_rs
    // documentation) are deliberately discarded, so characters that cannot be
    // represented in the target charset do not produce an error here.
    let (output, _, _) = encoder.encode(value);

    // `into_owned` hands over the encoder's buffer when it already allocated
    // one, instead of unconditionally copying it through `to_vec()`.
    Ok(Value::Bytes(output.into_owned().into()))
}

/// Builds the error reported when `to_charset` is not a known encoding label.
///
/// The label is echoed in the message; if it is not valid UTF-8 the literal
/// text `unknown` is shown in its place.
fn create_error(to_charset: &[u8]) -> ExpressionError {
    let charset_name = from_utf8(to_charset).unwrap_or("unknown");

    let message = format!("Unknown charset: {}", charset_name);
    let labels = vec![Label::primary("Unknown charset", Span::default())];
    // Point users at the standard that defines the accepted labels.
    let notes = vec![Note::SeeDocs(
        String::from("Encoding Living Standard"),
        String::from("https://encoding.spec.whatwg.org/"),
    )];

    ExpressionError::Error {
        message,
        labels,
        notes,
    }
}

/// Compiled form of an `encode_charset` call, holding its two argument
/// expressions.
///
/// NOTE(review): the type name appears to be carried over from
/// `decode_charset.rs` even though this expression encodes. It is left as-is
/// because `EncodeCharset::compile` constructs it by this name; rename both
/// together.
#[derive(Debug, Clone)]
struct DecodeCharsetFn {
    /// Expression producing the UTF-8 text to encode.
    value: Box<dyn Expression>,
    /// Expression producing the target charset label.
    to_charset: Box<dyn Expression>,
}

impl FunctionExpression for DecodeCharsetFn {
    /// Resolves both arguments to byte strings (`value` first) and encodes
    /// `value` into the requested charset.
    fn resolve(&self, ctx: &mut Context) -> Resolved {
        let value = self.value.resolve(ctx)?.try_bytes()?;
        let to_charset = self.to_charset.resolve(ctx)?.try_bytes()?;

        // A VRL byte string is not guaranteed to be valid UTF-8 (for example
        // the output of `decode_base64`). The previous `from_utf8(..).unwrap()`
        // panicked on such input; invalid sequences are now replaced with
        // U+FFFD so the call can never abort the running program.
        let text = String::from_utf8_lossy(value.as_bytes());

        encode_charset(&text, to_charset.as_bytes())
    }

    /// The result is bytes, and the call may fail.
    fn type_def(&self, _state: &TypeState) -> TypeDef {
        TypeDef::bytes().fallible()
    }
}

// Unit tests for `encode_charset`, expressed through the `test_function!` macro.
// Each case lists the call arguments, the expected result, and the expected
// type definition.
#[cfg(test)]
mod test {
use super::*;
use crate::value;

test_function![
encode_charset => EncodeCharset;

// Korean text encoded with the "euc-kr" label yields the bytes in `want`.
encode_to_euc_kr {
args: func_args![value: value!("안녕하세요"),
to_charset: value!("euc-kr")],
want: Ok(value!(b"\xbe\xc8\xb3\xe7\xc7\xcf\xbc\xbc\xbf\xe4")),
tdef: TypeDef::bytes().fallible(),
}

// Japanese text encoded with the "euc-jp" label yields the bytes in `want`.
encode_to_euc_jp {
args: func_args![value: value!("こんにちは"),
to_charset: value!("euc-jp")],
want: Ok(value!(b"\xa4\xb3\xa4\xf3\xa4\xcb\xa4\xc1\xa4\xcf")),
tdef: TypeDef::bytes().fallible(),
}

// Chinese text encoded with the "gb2312" label yields the bytes in `want`.
encode_to_gb2312 {
args: func_args![value: value!("你好"),
to_charset: value!("gb2312")],
want: Ok(value!(b"\xc4\xe3\xba\xc3")),
tdef: TypeDef::bytes().fallible(),
}

// An unrecognised label ("euc--kr") yields the "Unknown charset" error.
unknown_charset {
args: func_args![value: value!("안녕하세요"),
to_charset: value!("euc--kr")],
want: Err("Unknown charset: euc--kr"),
tdef: TypeDef::bytes().fallible(),
}
];
}
Loading

0 comments on commit ddcb49e

Please sign in to comment.