From 7f010fc8b9b2b73b40ab77457e10edcfa7caa383 Mon Sep 17 00:00:00 2001 From: Semyon Uchvatov Date: Thu, 2 Jan 2025 19:24:01 +0300 Subject: [PATCH] feat(parse_cbor): add parse_cbor function (#1152) * Add cbor support * Move json_type_def to separate module --------- Co-authored-by: Semyon Uchvatov --- Cargo.lock | 1 + Cargo.toml | 2 + LICENSE-3rdparty.csv | 3 + changelog.d/1152.feature.md | 1 + license-tool.toml | 2 + src/stdlib/json_utils/json_type_def.rs | 24 +++++ src/stdlib/json_utils/mod.rs | 1 + src/stdlib/mod.rs | 4 + src/stdlib/parse_cbor.rs | 132 +++++++++++++++++++++++++ src/stdlib/parse_json.rs | 68 ++++--------- src/stdlib/parse_proto.rs | 28 +----- tests/data/cbor/complex.cbor | 1 + tests/data/cbor/simple.cbor | 1 + 13 files changed, 194 insertions(+), 74 deletions(-) create mode 100644 changelog.d/1152.feature.md create mode 100644 license-tool.toml create mode 100644 src/stdlib/json_utils/json_type_def.rs create mode 100644 src/stdlib/json_utils/mod.rs create mode 100644 src/stdlib/parse_cbor.rs create mode 100644 tests/data/cbor/complex.cbor create mode 100644 tests/data/cbor/simple.cbor diff --git a/Cargo.lock b/Cargo.lock index 143bd18c90..cce4f2e3e6 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3612,6 +3612,7 @@ dependencies = [ "charset", "chrono", "chrono-tz", + "ciborium", "cidr-utils", "clap", "codespan-reporting", diff --git a/Cargo.toml b/Cargo.toml index 2db1dc58f3..3c879d6d59 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -68,6 +68,7 @@ stdlib = [ "dep:chacha20poly1305", "dep:charset", "dep:convert_case", + "dep:ciborium", "dep:cidr-utils", "dep:community-id", "dep:crc", @@ -131,6 +132,7 @@ charset = { version = "0.1", optional = true } encoding_rs = { version = "0.8.35", optional = false } chrono = { version = "0.4", default-features = false, features = ["clock", "serde", "wasmbind"], optional = true } chrono-tz = { version = "0.10", default-features = false, optional = true } +ciborium = { version = "0.2.2", default-features = false, optional = true } cidr-utils = { version = "0.6", optional = true } csv = { version = "1", optional = true } clap = { version = "4", features = ["derive"], optional = true } diff --git a/LICENSE-3rdparty.csv b/LICENSE-3rdparty.csv index 48f3e49c7a..849b036c03 100644 --- a/LICENSE-3rdparty.csv +++ b/LICENSE-3rdparty.csv @@ -44,6 +44,7 @@ chacha20poly1305,https://github.com/RustCrypto/AEADs/tree/master/chacha20poly130 charset,https://github.com/hsivonen/charset,Apache-2.0 OR MIT,Henri Sivonen chrono,https://github.com/chronotope/chrono,MIT OR Apache-2.0,The chrono Authors chrono-tz,https://github.com/chronotope/chrono-tz,MIT OR Apache-2.0,The chrono-tz Authors +ciborium,https://github.com/enarx/ciborium,Apache-2.0,Nathaniel McCallum cidr,https://github.com/stbuehler/rust-cidr,MIT,Stefan Bühler cidr-utils,https://github.com/magiclen/cidr-utils,MIT,Magic Len cipher,https://github.com/RustCrypto/traits,MIT OR Apache-2.0,RustCrypto Developers @@ -68,6 +69,7 @@ crc32fast,https://github.com/srijs/rust-crc32fast,MIT OR Apache-2.0,"Sam Rijs crypto-common,https://github.com/RustCrypto/traits,MIT OR Apache-2.0,RustCrypto Developers crypto_secretbox,https://github.com/RustCrypto/nacl-compat/tree/master/crypto_secretbox,Apache-2.0 OR MIT,RustCrypto Developers csv,https://github.com/BurntSushi/rust-csv,Unlicense OR MIT,Andrew Gallant @@ -103,6 +105,7 @@ generic-array,https://github.com/fizyk20/generic-array,MIT,"BartÅ‚omiej KamiÅ„sk getrandom,https://github.com/rust-random/getrandom,MIT OR Apache-2.0,The Rand Project Developers gimli,https://github.com/gimli-rs/gimli,MIT OR Apache-2.0,The gimli Authors grok,https://github.com/daschl/grok,Apache-2.0,Michael Nitschinger +half,https://github.com/starkat99/half-rs,MIT OR Apache-2.0,Kathryn Long hashbrown,https://github.com/rust-lang/hashbrown,MIT OR Apache-2.0,Amanieu d'Antras heck,https://github.com/withoutboats/heck,MIT OR Apache-2.0,The heck Authors heck,https://github.com/withoutboats/heck,MIT OR Apache-2.0,Without Boats diff --git a/changelog.d/1152.feature.md b/changelog.d/1152.feature.md new file mode 100644 index 0000000000..42ca00d855 --- /dev/null +++ b/changelog.d/1152.feature.md @@ -0,0 +1 @@ +Add `parse_cbor` function diff --git a/license-tool.toml b/license-tool.toml new file mode 100644 index 0000000000..bab03387d5 --- /dev/null +++ b/license-tool.toml @@ -0,0 +1,2 @@ +[overrides] +"crunchy" = { origin = "https://github.com/eira-fransham/crunchy" } diff --git a/src/stdlib/json_utils/json_type_def.rs b/src/stdlib/json_utils/json_type_def.rs new file mode 100644 index 0000000000..77ffa5cd4e --- /dev/null +++ b/src/stdlib/json_utils/json_type_def.rs @@ -0,0 +1,24 @@ +use crate::prelude::{Collection, TypeDef}; +use crate::value::Kind; + +pub(crate) fn json_inner_kind() -> Kind { + Kind::null() + | Kind::bytes() + | Kind::integer() + | Kind::float() + | Kind::boolean() + | Kind::array(Collection::any()) + | Kind::object(Collection::any()) +} + +pub(crate) fn json_type_def() -> TypeDef { + TypeDef::bytes() + .fallible() + .or_boolean() + .or_integer() + .or_float() + .add_null() + .or_null() + .or_array(Collection::from_unknown(json_inner_kind())) + .or_object(Collection::from_unknown(json_inner_kind())) +} diff --git a/src/stdlib/json_utils/mod.rs b/src/stdlib/json_utils/mod.rs new file mode 100644 index 0000000000..472564627e --- /dev/null +++ b/src/stdlib/json_utils/mod.rs @@ -0,0 +1 @@ +pub(crate) mod json_type_def; diff --git a/src/stdlib/mod.rs b/src/stdlib/mod.rs index 3df2151944..f6e91d2caf 100644 --- a/src/stdlib/mod.rs +++ b/src/stdlib/mod.rs @@ -31,6 +31,7 @@ pub use wasm_unsupported_function::WasmUnsupportedFunction; use crate::compiler::Function; +mod json_utils; mod string_utils; mod util; mod wasm_unsupported_function; @@ -141,6 +142,7 @@ cfg_if::cfg_if! { mod parse_aws_cloudwatch_log_subscription_message; mod parse_aws_vpc_flow_log; mod parse_cef; + mod parse_cbor; mod parse_common_log; mod parse_csv; mod parse_duration; @@ -324,6 +326,7 @@ cfg_if::cfg_if! { pub use parse_aws_alb_log::ParseAwsAlbLog; pub use parse_aws_cloudwatch_log_subscription_message::ParseAwsCloudWatchLogSubscriptionMessage; pub use parse_aws_vpc_flow_log::ParseAwsVpcFlowLog; + pub use parse_cbor::ParseCbor; pub use parse_cef::ParseCef; pub use parse_common_log::ParseCommonLog; pub use parse_csv::ParseCsv; @@ -514,6 +517,7 @@ pub fn all() -> Vec> { Box::new(ParseAwsAlbLog), Box::new(ParseAwsCloudWatchLogSubscriptionMessage), Box::new(ParseAwsVpcFlowLog), + Box::new(ParseCbor), Box::new(ParseCef), Box::new(ParseCommonLog), Box::new(ParseCsv), diff --git a/src/stdlib/parse_cbor.rs b/src/stdlib/parse_cbor.rs new file mode 100644 index 0000000000..c384d776cb --- /dev/null +++ b/src/stdlib/parse_cbor.rs @@ -0,0 +1,132 @@ +use crate::compiler::prelude::*; +use crate::stdlib::json_utils::json_type_def::json_type_def; +use ciborium::de::from_reader; +use zstd::zstd_safe::WriteBuf; + +fn parse_cbor(value: Value) -> Resolved { + let bytes = value.try_bytes()?; + let value = from_reader(bytes.as_slice()).map_err(|e| format!("unable to parse cbor: {e}"))?; + Ok(value) +} + +#[derive(Clone, Copy, Debug)] +pub struct ParseCbor; + +impl Function for ParseCbor { + fn identifier(&self) -> &'static str { + "parse_cbor" + } + + fn summary(&self) -> &'static str { + "parse a string to a CBOR type" + } + + fn usage(&self) -> &'static str { + indoc! {" + Parses the provided `value` as CBOR. + "} + } + + fn examples(&self) -> &'static [Example] { + &[ + Example { + title: "object", + source: r#"parse_cbor!(decode_base64!("oWVmaWVsZGV2YWx1ZQ=="))"#, + result: Ok(r#"{ "field": "value" }"#), + }, + Example { + title: "array", + source: r#"parse_cbor!(decode_base64!("gvUA"))"#, + result: Ok("[true, 0]"), + }, + Example { + title: "string", + source: r#"parse_cbor!(decode_base64!("ZWhlbGxv"))"#, + result: Ok("hello"), + }, + Example { + title: "integer", + source: r#"parse_cbor!(decode_base64!("GCo="))"#, + result: Ok("42"), + }, + Example { + title: "float", + source: r#"parse_cbor!(decode_base64!("+0BFEKPXCj1x"))"#, + result: Ok("42.13"), + }, + Example { + title: "boolean", + source: r#"parse_cbor!(decode_base64!("9A=="))"#, + result: Ok("false"), + }, + ] + } + + fn compile( + &self, + _state: &state::TypeState, + _ctx: &mut FunctionCompileContext, + arguments: ArgumentList, + ) -> Compiled { + let value = arguments.required("value"); + Ok(ParseCborFn { value }.as_expr()) + } + + fn parameters(&self) -> &'static [Parameter] { + &[Parameter { + keyword: "value", + kind: kind::BYTES, + required: true, + }] + } +} + +#[derive(Debug, Clone)] +struct ParseCborFn { + value: Box, +} + +impl FunctionExpression for ParseCborFn { + fn resolve(&self, ctx: &mut Context) -> Resolved { + let value = self.value.resolve(ctx)?; + parse_cbor(value) + } + + fn type_def(&self, _: &state::TypeState) -> TypeDef { + json_type_def() + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::value; + use nom::AsBytes; + use std::env; + use std::fs; + use std::path::PathBuf; + + fn test_data_dir() -> PathBuf { + PathBuf::from(env::var_os("CARGO_MANIFEST_DIR").unwrap()).join("tests/data/cbor") + } + + fn read_cbor_file(cbor_bin_message_path: &str) -> Vec { + fs::read(test_data_dir().join(cbor_bin_message_path)).unwrap() + } + + test_function![ + parse_cbor => ParseCbor; + + parses { + args: func_args![ value: value!(read_cbor_file("simple.cbor").as_bytes()) ], + want: Ok(value!({ field: "value" })), + tdef: json_type_def(), + } + + complex_cbor { + args: func_args![ value: value!(read_cbor_file("complex.cbor").as_bytes()) ], + want: Ok(value!({ object: {string: "value", number: 42, array: ["hello", "world"], boolean: false} })), + tdef: json_type_def(), + } + ]; +} diff --git a/src/stdlib/parse_json.rs b/src/stdlib/parse_json.rs index a26316c48f..06ef79b59f 100644 --- a/src/stdlib/parse_json.rs +++ b/src/stdlib/parse_json.rs @@ -6,6 +6,7 @@ use serde_json::{ }; use crate::compiler::prelude::*; +use crate::stdlib::json_utils::json_type_def::json_type_def; fn parse_json(value: Value, lossy: Option) -> Resolved { let lossy = lossy.map(Value::try_boolean).transpose()?.unwrap_or(true); @@ -226,7 +227,7 @@ impl FunctionExpression for ParseJsonFn { } fn type_def(&self, _: &state::TypeState) -> TypeDef { - type_def() + json_type_def() } } @@ -250,31 +251,10 @@ impl FunctionExpression for ParseJsonMaxDepthFn { } fn type_def(&self, _: &state::TypeState) -> TypeDef { - type_def() + json_type_def() } } -fn inner_kind() -> Kind { - Kind::null() - | Kind::bytes() - | Kind::integer() - | Kind::float() - | Kind::boolean() - | Kind::array(Collection::any()) - | Kind::object(Collection::any()) -} - -fn type_def() -> TypeDef { - TypeDef::bytes() - .fallible() - .or_boolean() - .or_integer() - .or_float() - .add_null() - .or_array(Collection::from_unknown(inner_kind())) - .or_object(Collection::from_unknown(inner_kind())) -} - #[cfg(test)] mod tests { use super::*; @@ -286,74 +266,62 @@ mod tests { parses { args: func_args![ value: r#"{"field": "value"}"# ], want: Ok(value!({ field: "value" })), - tdef: type_def(), + tdef: json_type_def(), } complex_json { args: func_args![ value: r#"{"object": {"string":"value","number":42,"array":["hello","world"],"boolean":false}}"# ], want: Ok(value!({ object: {string: "value", number: 42, array: ["hello", "world"], boolean: false} })), - tdef: type_def(), + tdef: json_type_def(), } invalid_json_errors { args: func_args![ value: r#"{"field": "value"# ], want: Err("unable to parse json: EOF while parsing a string at line 1 column 16"), - tdef: TypeDef::bytes().fallible() - .or_boolean() - .or_integer() - .or_float() - .or_null() - .or_array(Collection::from_unknown(inner_kind())) - .or_object(Collection::from_unknown(inner_kind())), + tdef: json_type_def(), } max_depth { args: func_args![ value: r#"{"top_layer": {"layer_one": "finish", "layer_two": 2}}"#, max_depth: 1], want: Ok(value!({ top_layer: r#"{"layer_one": "finish", "layer_two": 2}"# })), - tdef: type_def(), + tdef: json_type_def(), } max_depth_array { args: func_args![ value: r#"[{"top_layer": {"next_layer": ["finish"]}}]"#, max_depth: 2], want: Ok(value!([{ top_layer: r#"{"next_layer": ["finish"]}"# }])), - tdef: type_def(), + tdef: json_type_def(), } max_depth_exceeds_layers { args: func_args![ value: r#"{"top_layer": {"layer_one": "finish", "layer_two": 2}}"#, max_depth: 10], want: Ok(value!({ top_layer: {layer_one: "finish", layer_two: 2} })), - tdef: type_def(), + tdef: json_type_def(), } invalid_json_with_max_depth { args: func_args![ value: r#"{"field": "value"#, max_depth: 3 ], want: Err("unable to read json: EOF while parsing a string at line 1 column 16"), - tdef: TypeDef::bytes().fallible() - .or_boolean() - .or_integer() - .or_float() - .or_null() - .or_array(Collection::from_unknown(inner_kind())) - .or_object(Collection::from_unknown(inner_kind())), + tdef: json_type_def(), } invalid_input_max_depth { args: func_args![ value: r#"{"top_layer": "finish"}"#, max_depth: 129], want: Err("max_depth value should be greater than 0 and less than 128, got 129"), - tdef: type_def(), + tdef: json_type_def(), } // // TODO: provide a function version of the `test_function!` macro. max_int { args: func_args![ value: format!("{{\"num\": {}}}", i64::MAX - 1)], want: Ok(value!({"num": 9_223_372_036_854_775_806_i64})), - tdef: type_def(), + tdef: json_type_def(), } lossy_float_conversion { args: func_args![ value: r#"{"num": 9223372036854775808}"#], want: Ok(value!({"num": 9.223_372_036_854_776e18})), - tdef: type_def(), + tdef: json_type_def(), } // Checks that the parsing uses the default lossy argument value @@ -362,7 +330,7 @@ mod tests { // 0xf5 is out of the range of valid UTF-8 bytes args: func_args![ value: Bytes::from_static(&[0x22,0xf5,0x22])], want: Ok(value!(std::char::REPLACEMENT_CHARACTER.to_string())), - tdef: type_def(), + tdef: json_type_def(), } parse_invalid_utf8_lossy_arg_true { @@ -370,13 +338,13 @@ mod tests { args: func_args![ value: Bytes::from_static(&[0x22,0xf5,0x22]), lossy: true], // U+FFFD is the replacement character for invalid UTF-8 want: Ok(value!(std::char::REPLACEMENT_CHARACTER.to_string())), - tdef: type_def(), + tdef: json_type_def(), } invalid_utf8_json_lossy_arg_false { args: func_args![ value: Bytes::from_static(&[0x22,0xf5,0x22]), lossy: false], want: Err("unable to parse json: invalid unicode code point at line 1 column 3"), - tdef: type_def(), + tdef: json_type_def(), } ]; @@ -387,7 +355,7 @@ mod tests { no_roundtrip_float_conversion { args: func_args![ value: r#"{"num": 1626175065.5934923}"#], want: Ok(value!({"num": 1_626_175_065.593_492_5})), - tdef: type_def(), + tdef: json_type_def(), } ]; @@ -398,7 +366,7 @@ mod tests { roundtrip_float_conversion { args: func_args![ value: r#"{"num": 1626175065.5934923}"#], want: Ok(value!({"num": 1_626_175_065.593_492_3})), - tdef: type_def(), + tdef: json_type_def(), } ]; } diff --git a/src/stdlib/parse_proto.rs b/src/stdlib/parse_proto.rs index 0c55f29307..aacbe199e1 100644 --- a/src/stdlib/parse_proto.rs +++ b/src/stdlib/parse_proto.rs @@ -1,6 +1,7 @@ use crate::compiler::prelude::*; use crate::protobuf::get_message_descriptor; use crate::protobuf::parse_proto; +use crate::stdlib::json_utils::json_type_def::json_type_def; use once_cell::sync::Lazy; use prost_reflect::MessageDescriptor; use std::env; @@ -112,31 +113,10 @@ impl FunctionExpression for ParseProtoFn { } fn type_def(&self, _: &state::TypeState) -> TypeDef { - type_def() + json_type_def() } } -fn inner_kind() -> Kind { - Kind::null() - | Kind::bytes() - | Kind::integer() - | Kind::float() - | Kind::boolean() - | Kind::array(Collection::any()) - | Kind::object(Collection::any()) -} - -fn type_def() -> TypeDef { - TypeDef::bytes() - .fallible() - .or_boolean() - .or_integer() - .or_float() - .add_null() - .or_array(Collection::from_unknown(inner_kind())) - .or_object(Collection::from_unknown(inner_kind())) -} - #[cfg(test)] mod tests { use super::*; @@ -159,7 +139,7 @@ mod tests { desc_file: test_data_dir().join("test_protobuf.desc").to_str().unwrap().to_owned(), message_type: "test_protobuf.Person"], want: Ok(value!({ name: "someone", phones: [{number: "123456"}] })), - tdef: type_def(), + tdef: json_type_def(), } parses_proto3 { @@ -167,7 +147,7 @@ mod tests { desc_file: test_data_dir().join("test_protobuf3.desc").to_str().unwrap().to_owned(), message_type: "test_protobuf3.Person"], want: Ok(value!({ data: {data_phone: "HOME"}, name: "someone", phones: [{number: "1234", type: "MOBILE"}] })), - tdef: type_def(), + tdef: json_type_def(), } ]; } diff --git a/tests/data/cbor/complex.cbor b/tests/data/cbor/complex.cbor new file mode 100644 index 0000000000..6f4bfb247c --- /dev/null +++ b/tests/data/cbor/complex.cbor @@ -0,0 +1 @@ +¡fobject¤fstringevaluefnumber*earray‚ehelloeworldgbooleanô \ No newline at end of file diff --git a/tests/data/cbor/simple.cbor b/tests/data/cbor/simple.cbor new file mode 100644 index 0000000000..4381f17dc5 --- /dev/null +++ b/tests/data/cbor/simple.cbor @@ -0,0 +1 @@ +¡efieldevalue \ No newline at end of file