-
Notifications
You must be signed in to change notification settings - Fork 77
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat(stdlib): add encode_charset, decode_charset functions (#1162)
* feat(stdlib): add encode_charset, decode_charset functions Charset encoding and decoding functions were implemented to integrate the charset settings used in various operating systems and systems with vector. - decode_charset(value, from_charset) - encode_charset(value, to_charset) * test(stdlib): add encode/decode_charset to benches/stdlib.rs * docs(stdlib): add changelog.d * chore(stdlib): Modified the error function and its parameter name - rename `error` function to `create_error` - rename `from` to `from_charset` in decode_charset - rename `from` to `to_charset` in encode_charset --------- Co-authored-by: Jesse Szwedko <[email protected]>
- Loading branch information
Showing
7 changed files
with
339 additions
and
0 deletions.
There are no files selected for viewing
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
Add new `decode_charset` and `encode_charset` functions, which convert strings from another charset to UTF-8 and from UTF-8 to another charset, respectively. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,156 @@ | ||
use crate::diagnostic::Label; | ||
use crate::prelude::*; | ||
use encoding_rs::Encoding; | ||
use nom::AsBytes; | ||
use std::str::from_utf8; | ||
|
||
/// Marker type that registers the `decode_charset` function; its behavior is
/// provided by the `Function` implementation on this type.
#[derive(Debug, Clone, Copy)]
pub struct DecodeCharset;
|
||
impl Function for DecodeCharset { | ||
fn identifier(&self) -> &'static str { | ||
"decode_charset" | ||
} | ||
|
||
fn examples(&self) -> &'static [Example] { | ||
&[ | ||
Example { | ||
title: "Decode charset from euc-kr", | ||
source: r#"decode_charset!(decode_base64!("vsiz58fPvLy/5A=="), "euc-kr")"#, | ||
result: Ok("안녕하세요"), | ||
}, | ||
Example { | ||
title: "Decode charset from euc-jp", | ||
source: r#"decode_charset!(decode_base64!("pLOk86TLpMGkzw=="), "euc-jp")"#, | ||
result: Ok("こんにちは"), | ||
}, | ||
Example { | ||
title: "Decode charset from gb2312", | ||
source: r#"decode_charset!(decode_base64!("xOO6ww=="), "gb2312")"#, | ||
result: Ok("你好"), | ||
}, | ||
] | ||
} | ||
|
||
fn summary(&self) -> &'static str { | ||
"Decode non UTF-8 charset to UTF-8" | ||
} | ||
|
||
fn usage(&self) -> &'static str { | ||
indoc! {" | ||
Decode non UTF-8 charset to UTF-8. | ||
The `value` parameter is a non UTF-8 encoded string. | ||
The `from_charset` parameter specifies the charset of the `value`. | ||
"} | ||
} | ||
|
||
fn parameters(&self) -> &'static [Parameter] { | ||
&[ | ||
Parameter { | ||
keyword: "value", | ||
kind: kind::BYTES, | ||
required: true, | ||
}, | ||
Parameter { | ||
keyword: "from_charset", | ||
kind: kind::BYTES, | ||
required: true, | ||
}, | ||
] | ||
} | ||
|
||
fn compile( | ||
&self, | ||
_state: &TypeState, | ||
_ctx: &mut FunctionCompileContext, | ||
arguments: ArgumentList, | ||
) -> Compiled { | ||
let value = arguments.required("value"); | ||
let from_charset = arguments.required("from_charset"); | ||
|
||
Ok(DecodeCharsetFn { | ||
value, | ||
from_charset, | ||
} | ||
.as_expr()) | ||
} | ||
} | ||
|
||
fn decode_charset(value: &[u8], from_charset: &[u8]) -> Resolved { | ||
let decoder = Encoding::for_label(from_charset).ok_or_else(|| create_error(from_charset))?; | ||
|
||
let (output, _, _) = decoder.decode(value); | ||
Ok(Value::Bytes(output.as_bytes().to_vec().into())) | ||
} | ||
|
||
fn create_error(from_charset: &[u8]) -> ExpressionError { | ||
ExpressionError::Error { | ||
message: format!( | ||
"Unknown charset: {}", | ||
from_utf8(from_charset).unwrap_or("unknown") | ||
), | ||
labels: vec![Label::primary("Unknown charset", Span::default())], | ||
notes: vec![Note::SeeDocs( | ||
"Encoding Living Standard".to_string(), | ||
"https://encoding.spec.whatwg.org/".to_string(), | ||
)], | ||
} | ||
} | ||
|
||
#[derive(Debug, Clone)] | ||
struct DecodeCharsetFn { | ||
value: Box<dyn Expression>, | ||
from_charset: Box<dyn Expression>, | ||
} | ||
|
||
impl FunctionExpression for DecodeCharsetFn { | ||
fn resolve(&self, ctx: &mut Context) -> Resolved { | ||
let value = self.value.resolve(ctx)?.try_bytes()?; | ||
let from = self.from_charset.resolve(ctx)?.try_bytes()?; | ||
|
||
decode_charset(value.as_bytes(), from.as_bytes()) | ||
} | ||
|
||
fn type_def(&self, _state: &TypeState) -> TypeDef { | ||
TypeDef::bytes().fallible() | ||
} | ||
} | ||
|
||
#[cfg(test)]
mod test {
    use crate::value;

    use super::*;

    test_function![
        decode_charset => DecodeCharset;

        // EUC-KR bytes for "안녕하세요".
        decode_from_euc_kr {
            args: func_args![
                value: b"\xbe\xc8\xb3\xe7\xc7\xcf\xbc\xbc\xbf\xe4",
                from_charset: value!("euc-kr")
            ],
            want: Ok(value!("안녕하세요")),
            tdef: TypeDef::bytes().fallible(),
        }

        // EUC-JP bytes for "こんにちは".
        decode_from_euc_jp {
            args: func_args![
                value: b"\xa4\xb3\xa4\xf3\xa4\xcb\xa4\xc1\xa4\xcf",
                from_charset: value!("euc-jp")
            ],
            want: Ok(value!("こんにちは")),
            tdef: TypeDef::bytes().fallible(),
        }

        // GB2312 bytes for "你好".
        decode_from_gb2312 {
            args: func_args![
                value: b"\xc4\xe3\xba\xc3",
                from_charset: value!("gb2312")
            ],
            want: Ok(value!("你好")),
            tdef: TypeDef::bytes().fallible(),
        }

        // A label that is not a known charset must surface as an error.
        unknown_charset {
            args: func_args![
                value: value!(b"\xbe\xc8\xb3\xe7\xc7\xcf\xbc\xbc\xbf\xe4"),
                from_charset: value!(b"euc--kr")
            ],
            want: Err("Unknown charset: euc--kr"),
            tdef: TypeDef::bytes().fallible(),
        }
    ];
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,152 @@ | ||
use crate::diagnostic::Label; | ||
use crate::prelude::*; | ||
use encoding_rs::Encoding; | ||
use nom::AsBytes; | ||
use std::str::from_utf8; | ||
|
||
/// Marker type that registers the `encode_charset` function; its behavior is
/// provided by the `Function` implementation on this type.
#[derive(Debug, Clone, Copy)]
pub struct EncodeCharset;
|
||
impl Function for EncodeCharset { | ||
fn identifier(&self) -> &'static str { | ||
"encode_charset" | ||
} | ||
|
||
fn examples(&self) -> &'static [Example] { | ||
&[ | ||
Example { | ||
title: "Encode charset to euc-kr", | ||
source: r#"encode_base64(encode_charset!("안녕하세요", "euc-kr"))"#, | ||
result: Ok("vsiz58fPvLy/5A=="), | ||
}, | ||
Example { | ||
title: "Encode charset to euc-jp", | ||
source: r#"encode_base64(encode_charset!("こんにちは", "euc-jp"))"#, | ||
result: Ok(r"pLOk86TLpMGkzw=="), | ||
}, | ||
Example { | ||
title: "Encode charset to gb2312", | ||
source: r#"encode_base64(encode_charset!("你好", "gb2312"))"#, | ||
result: Ok(r"xOO6ww=="), | ||
}, | ||
] | ||
} | ||
|
||
fn summary(&self) -> &'static str { | ||
"Encode UTF-8 to non UTF-8 charset" | ||
} | ||
|
||
fn usage(&self) -> &'static str { | ||
indoc! {" | ||
Encode UTF-8 to non UTF-8 charset. | ||
The `value` parameter is a UTF-8 encoded string. | ||
The `to_charset` parameter specifies the charset to encode the `value`. | ||
"} | ||
} | ||
|
||
fn parameters(&self) -> &'static [Parameter] { | ||
&[ | ||
Parameter { | ||
keyword: "value", | ||
kind: kind::BYTES, | ||
required: true, | ||
}, | ||
Parameter { | ||
keyword: "to_charset", | ||
kind: kind::BYTES, | ||
required: true, | ||
}, | ||
] | ||
} | ||
|
||
fn compile( | ||
&self, | ||
_state: &TypeState, | ||
_ctx: &mut FunctionCompileContext, | ||
arguments: ArgumentList, | ||
) -> Compiled { | ||
let value = arguments.required("value"); | ||
let to_charset = arguments.required("to_charset"); | ||
|
||
Ok(DecodeCharsetFn { value, to_charset }.as_expr()) | ||
} | ||
} | ||
|
||
fn encode_charset(value: &str, to_charset: &[u8]) -> Resolved { | ||
let encoder = Encoding::for_label(to_charset).ok_or_else(|| create_error(to_charset))?; | ||
|
||
let (output, _, _) = encoder.encode(value); | ||
Ok(Value::Bytes(output.as_bytes().to_vec().into())) | ||
} | ||
|
||
fn create_error(to_charset: &[u8]) -> ExpressionError { | ||
ExpressionError::Error { | ||
message: format!( | ||
"Unknown charset: {}", | ||
from_utf8(to_charset).unwrap_or("unknown") | ||
), | ||
labels: vec![Label::primary("Unknown charset", Span::default())], | ||
notes: vec![Note::SeeDocs( | ||
"Encoding Living Standard".to_string(), | ||
"https://encoding.spec.whatwg.org/".to_string(), | ||
)], | ||
} | ||
} | ||
|
||
#[derive(Debug, Clone)] | ||
struct DecodeCharsetFn { | ||
value: Box<dyn Expression>, | ||
to_charset: Box<dyn Expression>, | ||
} | ||
|
||
impl FunctionExpression for DecodeCharsetFn { | ||
fn resolve(&self, ctx: &mut Context) -> Resolved { | ||
let value = self.value.resolve(ctx)?.try_bytes()?; | ||
let to_charset = self.to_charset.resolve(ctx)?.try_bytes()?; | ||
|
||
encode_charset(from_utf8(value.as_bytes()).unwrap(), to_charset.as_bytes()) | ||
} | ||
|
||
fn type_def(&self, _state: &TypeState) -> TypeDef { | ||
TypeDef::bytes().fallible() | ||
} | ||
} | ||
|
||
#[cfg(test)]
mod test {
    use crate::value;

    use super::*;

    test_function![
        encode_charset => EncodeCharset;

        // "안녕하세요" must come out as its EUC-KR byte sequence.
        encode_to_euc_kr {
            args: func_args![
                value: value!("안녕하세요"),
                to_charset: value!("euc-kr")
            ],
            want: Ok(value!(b"\xbe\xc8\xb3\xe7\xc7\xcf\xbc\xbc\xbf\xe4")),
            tdef: TypeDef::bytes().fallible(),
        }

        // "こんにちは" must come out as its EUC-JP byte sequence.
        encode_to_euc_jp {
            args: func_args![
                value: value!("こんにちは"),
                to_charset: value!("euc-jp")
            ],
            want: Ok(value!(b"\xa4\xb3\xa4\xf3\xa4\xcb\xa4\xc1\xa4\xcf")),
            tdef: TypeDef::bytes().fallible(),
        }

        // "你好" must come out as its GB2312 byte sequence.
        encode_to_gb2312 {
            args: func_args![
                value: value!("你好"),
                to_charset: value!("gb2312")
            ],
            want: Ok(value!(b"\xc4\xe3\xba\xc3")),
            tdef: TypeDef::bytes().fallible(),
        }

        // A label that is not a known charset must surface as an error.
        unknown_charset {
            args: func_args![
                value: value!("안녕하세요"),
                to_charset: value!("euc--kr")
            ],
            want: Err("Unknown charset: euc--kr"),
            tdef: TypeDef::bytes().fallible(),
        }
    ];
}
Oops, something went wrong.