diff --git a/.gitignore b/.gitignore index e57cc32e..a67cb447 100644 --- a/.gitignore +++ b/.gitignore @@ -5,3 +5,4 @@ /go ebpf-profiler ci-kernels +/target diff --git a/Cargo.lock b/Cargo.lock new file mode 100644 index 00000000..7fea342f --- /dev/null +++ b/Cargo.lock @@ -0,0 +1,686 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. +version = 3 + +[[package]] +name = "adler" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe" + +[[package]] +name = "aho-corasick" +version = "1.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e60d3430d3a69478ad0993f19238d2df97c507009a52b3c10addcd7f6bcb916" +dependencies = [ + "memchr", +] + +[[package]] +name = "anyhow" +version = "1.0.86" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b3d1d046238990b9cf5bcde22a3fb3584ee5cf65fb2765f454ed428c7a0063da" + +[[package]] +name = "base64" +version = "0.22.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6" + +[[package]] +name = "bitflags" +version = "2.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cf4b9d6a944f767f8e5e0db018570623c85f3d925ac718db4e06d0187adb21c1" + +[[package]] +name = "block-buffer" +version = "0.10.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3078c7629b62d3f0439517fa394996acacc5cbc91c5a20d8c658e77abd503a71" +dependencies = [ + "generic-array", +] + +[[package]] +name = "bytes" +version = "1.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "514de17de45fdb8dc022b1a7975556c53c86f9f0aa5f534b98977b171857c2c9" + +[[package]] +name = "cc" +version = "1.0.98" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"41c270e7540d725e65ac7f1b212ac8ce349719624d7bcff99f8e2e488e8cf03f" +dependencies = [ + "jobserver", + "libc", + "once_cell", +] + +[[package]] +name = "cfg-if" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" + +[[package]] +name = "cmake" +version = "0.1.50" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a31c789563b815f77f4250caee12365734369f942439b7defd71e18a48197130" +dependencies = [ + "cc", +] + +[[package]] +name = "cpp_demangle" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7e8227005286ec39567949b33df9896bcadfa6051bccca2488129f108ca23119" +dependencies = [ + "cfg-if", +] + +[[package]] +name = "cpufeatures" +version = "0.2.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "53fe5e26ff1b7aef8bca9c6080520cfb8d9333c7568e1829cef191a9723e5504" +dependencies = [ + "libc", +] + +[[package]] +name = "crc32fast" +version = "1.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a97769d94ddab943e4510d138150169a2758b5ef3eb191a9ee688de3e23ef7b3" +dependencies = [ + "cfg-if", +] + +[[package]] +name = "crypto-common" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1bfb12502f3fc46cca1bb51ac28df9d618d813cdc3d2f25b9fe775a34af26bb3" +dependencies = [ + "generic-array", + "typenum", +] + +[[package]] +name = "digest" +version = "0.10.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292" +dependencies = [ + "block-buffer", + "crypto-common", +] + +[[package]] +name = "either" +version = "1.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3dca9240753cf90908d7e4aac30f630662b02aebaa1b58a3cadabdb23385b58b" + +[[package]] +name = "equivalent" 
+version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5" + +[[package]] +name = "errno" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "534c5cf6194dfab3db3242765c03bbe257cf92f22b38f6bc0c58d59108a820ba" +dependencies = [ + "libc", + "windows-sys", +] + +[[package]] +name = "fallible-iterator" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2acce4a10f12dc2fb14a218589d4f1f62ef011b2d0cc4b3cb1bba8e94da14649" + +[[package]] +name = "fastrand" +version = "2.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9fc0510504f03c51ada170672ac806f1f105a88aa97a5281117e1ddc3368e51a" + +[[package]] +name = "fixedbitset" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0ce7134b9999ecaf8bcd65542e436736ef32ddca1b3e06094cb6ec5755203b80" + +[[package]] +name = "flate2" +version = "1.0.30" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5f54427cfd1c7829e2a139fcefea601bf088ebca651d2bf53ebc600eac295dae" +dependencies = [ + "crc32fast", + "miniz_oxide", +] + +[[package]] +name = "generic-array" +version = "0.14.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85649ca51fd72272d7821adaf274ad91c288277713d9c18820d8499a7ff69e9a" +dependencies = [ + "typenum", + "version_check", +] + +[[package]] +name = "gimli" +version = "0.30.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e2e1d97fbe9722ba9bbd0c97051c2956e726562b61f86a25a4360398a40edfc9" +dependencies = [ + "fallible-iterator", + "stable_deref_trait", +] + +[[package]] +name = "hashbrown" +version = "0.14.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1" + +[[package]] 
+name = "heck" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" + +[[package]] +name = "indexmap" +version = "2.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "68b900aa2f7301e21c36462b170ee99994de34dff39a4a6a528e80e7376d07e5" +dependencies = [ + "equivalent", + "hashbrown", +] + +[[package]] +name = "intervaltree" +version = "0.2.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "270bc34e57047cab801a8c871c124d9dc7132f6473c6401f645524f4e6edd111" +dependencies = [ + "smallvec", +] + +[[package]] +name = "itertools" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ba291022dbbd398a455acf126c1e341954079855bc60dfdda641363bd6922569" +dependencies = [ + "either", +] + +[[package]] +name = "jobserver" +version = "0.1.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d2b099aaa34a9751c5bf0878add70444e1ed2dd73f347be99003d4577277de6e" +dependencies = [ + "libc", +] + +[[package]] +name = "libc" +version = "0.2.155" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "97b3888a4aecf77e811145cadf6eef5901f4782c53886191b2f693f24761847c" + +[[package]] +name = "linux-raw-sys" +version = "0.4.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "78b3ae25bc7c8c38cec158d1f2757ee79e9b3740fbc7ccf0e59e4b08d793fa89" + +[[package]] +name = "log" +version = "0.4.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "90ed8c1e510134f979dbc4f070f87d4313098b704861a105fe34231c70a3901c" + +[[package]] +name = "lru" +version = "0.12.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d3262e75e648fce39813cb56ac41f3c3e3f65217ebf3844d818d1f9398cfb0dc" + +[[package]] +name = "memchr" +version = "2.7.2" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "6c8640c5d730cb13ebd907d8d04b52f55ac9a2eec55b440c8892f40d56c76c1d" + +[[package]] +name = "memmap2" +version = "0.9.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fe751422e4a8caa417e13c3ea66452215d7d63e19e604f4980461212f3ae1322" +dependencies = [ + "libc", +] + +[[package]] +name = "miniz_oxide" +version = "0.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "87dfd01fe195c66b572b37921ad8803d010623c0aca821bea2302239d155cdae" +dependencies = [ + "adler", +] + +[[package]] +name = "multimap" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "defc4c55412d89136f966bbb339008b474350e5e6e78d2714439c386b3137a03" + +[[package]] +name = "object" +version = "0.36.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "576dfe1fc8f9df304abb159d767a29d0476f7750fbf8aa7ad07816004a207434" +dependencies = [ + "memchr", +] + +[[package]] +name = "once_cell" +version = "1.19.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92" + +[[package]] +name = "petgraph" +version = "0.6.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b4c5cc86750666a3ed20bdaf5ca2a0344f9c67674cae0515bec2da16fbaa47db" +dependencies = [ + "fixedbitset", + "indexmap", +] + +[[package]] +name = "pkg-config" +version = "0.3.30" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d231b230927b5e4ad203db57bbcbee2802f6bce620b1e4a9024a07d94e2907ec" + +[[package]] +name = "prettyplease" +version = "0.2.22" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "479cf940fbbb3426c32c5d5176f62ad57549a0bb84773423ba8be9d089f5faba" +dependencies = [ + "proc-macro2", + "syn", +] + +[[package]] +name = "proc-macro2" +version = "1.0.84" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "ec96c6a92621310b51366f1e28d05ef11489516e93be030060e5fc12024a49d6" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "prost" +version = "0.12.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "deb1435c188b76130da55f17a466d252ff7b1418b2ad3e037d127b94e3411f29" +dependencies = [ + "bytes", + "prost-derive", +] + +[[package]] +name = "prost-build" +version = "0.12.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "22505a5c94da8e3b7c2996394d1c933236c4d743e81a410bcca4e6989fc066a4" +dependencies = [ + "bytes", + "heck", + "itertools", + "log", + "multimap", + "once_cell", + "petgraph", + "prettyplease", + "prost", + "prost-types", + "regex", + "syn", + "tempfile", +] + +[[package]] +name = "prost-derive" +version = "0.12.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "81bddcdb20abf9501610992b6759a4c888aef7d1a7247ef75e2404275ac24af1" +dependencies = [ + "anyhow", + "itertools", + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "prost-types" +version = "0.12.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9091c90b0a32608e984ff2fa4091273cbdd755d54935c51d520887f4a1dbd5b0" +dependencies = [ + "prost", +] + +[[package]] +name = "quote" +version = "1.0.36" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0fa76aaf39101c457836aec0ce2316dbdc3ab723cdda1c6bd4e6ad4208acaca7" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "regex" +version = "1.10.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4219d74c6b67a3654a9fbebc4b419e22126d13d2f3c4a07ee0cb61ff79a79619" +dependencies = [ + "aho-corasick", + "memchr", + "regex-automata", + "regex-syntax", +] + +[[package]] +name = "regex-automata" +version = "0.4.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"38caf58cc5ef2fed281f89292ef23f6365465ed9a41b7a7754eb4e26496c92df" +dependencies = [ + "aho-corasick", + "memchr", + "regex-syntax", +] + +[[package]] +name = "regex-syntax" +version = "0.8.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7a66a03ae7c801facd77a29370b4faec201768915ac14a721ba36f20bc9c209b" + +[[package]] +name = "rustc-demangle" +version = "0.1.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "719b953e2095829ee67db738b3bfa9fa368c94900df327b3f07fe6e794d2fe1f" + +[[package]] +name = "rustix" +version = "0.38.34" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "70dc5ec042f7a43c4a73241207cecc9873a06d45debb38b329f8541d85c2730f" +dependencies = [ + "bitflags", + "errno", + "libc", + "linux-raw-sys", + "windows-sys", +] + +[[package]] +name = "sha2" +version = "0.10.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "793db75ad2bcafc3ffa7c68b215fee268f537982cd901d132f89c6343f3a3dc8" +dependencies = [ + "cfg-if", + "cpufeatures", + "digest", +] + +[[package]] +name = "smallvec" +version = "1.13.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3c5e1a9a646d36c3599cd173a41282daf47c44583ad367b8e6837255952e5c67" + +[[package]] +name = "stable_deref_trait" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a8f112729512f8e442d81f95a8a7ddf2b7c6b8a1a6f509a95864142b30cab2d3" + +[[package]] +name = "symblib" +version = "0.0.0" +dependencies = [ + "anyhow", + "base64", + "cpp_demangle", + "fallible-iterator", + "flate2", + "gimli", + "intervaltree", + "lru", + "memmap2", + "object", + "prost", + "prost-build", + "rustc-demangle", + "sha2", + "smallvec", + "tempfile", + "thiserror", + "zstd", + "zydis", +] + +[[package]] +name = "symblib-capi" +version = "0.0.0" +dependencies = [ + "fallible-iterator", + "symblib", + "thiserror", +] + +[[package]] +name = "syn" +version 
= "2.0.77" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9f35bcdf61fd8e7be6caf75f429fdca8beb3ed76584befb503b1569faee373ed" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "tempfile" +version = "3.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85b77fafb263dd9d05cbeac119526425676db3784113aa9295c88498cbf8bff1" +dependencies = [ + "cfg-if", + "fastrand", + "rustix", + "windows-sys", +] + +[[package]] +name = "thiserror" +version = "1.0.61" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c546c80d6be4bc6a00c0f01730c08df82eaa7a7a61f11d656526506112cc1709" +dependencies = [ + "thiserror-impl", +] + +[[package]] +name = "thiserror-impl" +version = "1.0.61" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "46c3384250002a6d5af4d114f2845d37b57521033f30d5c3f46c4d70e1197533" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "typenum" +version = "1.17.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "42ff0bf0c66b8238c6f3b578df37d0b7848e55df8577b3f74f92a69acceeb825" + +[[package]] +name = "unicode-ident" +version = "1.0.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b" + +[[package]] +name = "version_check" +version = "0.9.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f" + +[[package]] +name = "windows-sys" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d" +dependencies = [ + "windows-targets", +] + +[[package]] +name = "windows-targets" +version = "0.52.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"6f0713a46559409d202e70e28227288446bf7841d3211583a4b53e3f6d96e7eb" +dependencies = [ + "windows_aarch64_gnullvm", + "windows_aarch64_msvc", + "windows_i686_gnu", + "windows_i686_gnullvm", + "windows_i686_msvc", + "windows_x86_64_gnu", + "windows_x86_64_gnullvm", + "windows_x86_64_msvc", +] + +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.52.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7088eed71e8b8dda258ecc8bac5fb1153c5cffaf2578fc8ff5d61e23578d3263" + +[[package]] +name = "windows_aarch64_msvc" +version = "0.52.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9985fd1504e250c615ca5f281c3f7a6da76213ebd5ccc9561496568a2752afb6" + +[[package]] +name = "windows_i686_gnu" +version = "0.52.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "88ba073cf16d5372720ec942a8ccbf61626074c6d4dd2e745299726ce8b89670" + +[[package]] +name = "windows_i686_gnullvm" +version = "0.52.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "87f4261229030a858f36b459e748ae97545d6f1ec60e5e0d6a3d32e0dc232ee9" + +[[package]] +name = "windows_i686_msvc" +version = "0.52.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "db3c2bf3d13d5b658be73463284eaf12830ac9a26a90c717b7f771dfe97487bf" + +[[package]] +name = "windows_x86_64_gnu" +version = "0.52.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4e4246f76bdeff09eb48875a0fd3e2af6aada79d409d33011886d3e1581517d9" + +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.52.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "852298e482cd67c356ddd9570386e2862b5673c85bd5f88df9ab6802b334c596" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.52.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bec47e5bfd1bff0eeaf6d8b485cc1074891a197ab4225d504cb7a1ab88b02bf0" + +[[package]] +name = 
"zstd" +version = "0.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2d789b1514203a1120ad2429eae43a7bd32b90976a7bb8a05f7ec02fa88cc23a" +dependencies = [ + "zstd-safe", +] + +[[package]] +name = "zstd-safe" +version = "7.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1cd99b45c6bc03a018c8b8a86025678c87e55526064e38f9df301989dce7ec0a" +dependencies = [ + "zstd-sys", +] + +[[package]] +name = "zstd-sys" +version = "2.0.10+zstd.1.5.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c253a4914af5bafc8fa8c86ee400827e83cf6ec01195ec1f1ed8441bf00d65aa" +dependencies = [ + "cc", + "pkg-config", +] + +[[package]] +name = "zydis" +version = "4.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0395dcbec9d43ff14811624d4876db7a4a51d1ed73ce3f9e89d14a7e4eeb9ae1" +dependencies = [ + "bitflags", + "cmake", +] diff --git a/Cargo.toml b/Cargo.toml new file mode 100644 index 00000000..831c6628 --- /dev/null +++ b/Cargo.toml @@ -0,0 +1,85 @@ +# Rust workspace. Allows command like `cargo test` to work anywhere within the +# repo and ensures that all components use the same dependency versions +# (global Cargo.lock). 
+ +[workspace] +members = [ + "rust-crates/symblib", + "rust-crates/symblib-capi", +] +resolver = "2" + +[workspace.package] +version = "0.0.0" +rust-version = "1.77" + +[profile.release] +lto = "thin" +codegen-units = 1 +panic = "abort" +opt-level = 3 +strip = "debuginfo" + +[profile.release-unstripped] +inherits = "release" +strip = false +debug = 1 + +[profile.release-with-asserts] +inherits = "release-unstripped" +overflow-checks = true +debug-assertions = true + +[profile.test] +opt-level = 1 # default of 0 is annoyingly slow + +[workspace.dependencies] +anyhow = "1" +argh = "0.1" +base64 = "0.22.0" +cpp_demangle = "0.4" +fallible-iterator = "0.3" +flate2 = "1" +memmap2 = "0.9.0" +native-tls = "0.2" +prost = "0.12.1" +prost-build = "0.12.1" +rustc-demangle = "0.1" +serde_json = "1" +sha2 = "0.10" +tempfile = "3" +thiserror = "1" +zstd = "0.13.0" +zydis = "4.1.1" + +[workspace.dependencies.gimli] +version = "0.30.0" +default-features = false +features = ["std", "endian-reader", "fallible-iterator"] + +[workspace.dependencies.intervaltree] +version = "0.2" +default-features = false +features = ["std"] + +[workspace.dependencies.lru] +version = "0.12.0" +default-features = false + +[workspace.dependencies.object] +version = "0.36.0" +default-features = false +features = ["std", "read_core", "elf", "macho", "unaligned"] + +[workspace.dependencies.serde] +version = "1" +features = ["derive"] + +[workspace.dependencies.smallvec] +version = "1" +features = ["const_new", "union", "const_generics", "write"] + +[workspace.dependencies.ureq] +version = "2" +default-features = false +features = ["gzip", "native-tls", "native-certs"] diff --git a/rust-crates/README.md b/rust-crates/README.md new file mode 100644 index 00000000..93ec4146 --- /dev/null +++ b/rust-crates/README.md @@ -0,0 +1,69 @@ +Rust components +=============== + +This directory contains the Rust components for symbolization of native traces. +They are built using the `cargo` build system. 
Please refer to the README +documents in the subdirectories for details. + +## Source code documentation + +> [!TIP] +> +> If you're trying to familiarize yourself with the codebase, this is heavily +> recommended. All the important documentation and `README`s are included into +> the rustdoc built documentation, and the generated doc is much more structured +> than what you'd get by just browsing through the repository. + +The source code is extensively documented with `rustdoc`, which is invoked +through cargo. + +```bash +# Build documentation for our Rust crates and open it in a browser window +cargo doc --document-private-items --workspace --open +``` + +By default, this will open the documentation for `symblib`. + +## Import style + +Whenever the name of a type or function that is being imported isn't necessarily +unique, we instead import the module that contains it and then use the module +name to qualify the access. This is essentially similar to how things are done +in Golang. + +If the item being imported has a very significant, unique name within the code- +base, it's also acceptable to import (`use`) that type directly and refer to it +without additional qualification. + +<details>
+<summary>Examples</summary> + +There are many different modules that expose `File` and `Range` types. Import +the module instead and qualify the items with `module::item`. + +```rust +use std::fs; +use symblib::objfile; + +let a: fs::File = todo!(); +let b: objfile::File = todo!(); +``` + +```rust +use std::ops; +use symblib::symbfile; + +let a: ops::Range = todo!(); +let b: symbfile::Range = todo!(); +``` + +`GoRuntimeInfo` is a very unique name that is unlikely to cause confusion even +without further qualification. Import the item directly. + +```rust +use symblib::gosym::GoRuntimeInfo; + +let a: GoRuntimeInfo<'static> = todo!(); +``` + +</details>
diff --git a/doc/symb-proto/README.md b/rust-crates/symb-proto/README.md similarity index 97% rename from doc/symb-proto/README.md rename to rust-crates/symb-proto/README.md index cbc1ec09..cc00b70a 100644 --- a/doc/symb-proto/README.md +++ b/rust-crates/symb-proto/README.md @@ -1,7 +1,5 @@ -Elastic symbolization protocol -============================== - -## `symbfile` format +symbfile format +=============== `symbfile` is our custom file format for efficiently storing large amounts of symbol information. A symbfile is a concatenation of length- and message-type @@ -20,7 +18,7 @@ We currently use two different symbol information representations: given address, the user would sweep though the whole symbfile and collect all ranges that contain the desired address and then order the resulting range records by their `depth` field. This presents the ground truth for symbol - information. + information. - **Return pad records ([`ReturnPadV1`])**\ These map a single address to the symbols of a full inline trace. We generate such records for each instruction following a `call`. The idea here is that @@ -31,7 +29,7 @@ We currently use two different symbol information representations: While the symbfile format would generally also allow mixing both record types into a single file, we currently always generate a separate symbfile per record -kind. +kind. More details about the format itself can be found in the documentation comments of the [protobuf definition][symbfile-proto]. @@ -95,4 +93,4 @@ explains the failure in greater detail, for example: `uuid` allows logically connecting user reports and logs: error reports from the user that contain the UUID allow finding the logs needed for -investigation and debugging. \ No newline at end of file +investigation and debugging. 
diff --git a/doc/symb-proto/symbfile.proto b/rust-crates/symb-proto/symbfile.proto similarity index 100% rename from doc/symb-proto/symbfile.proto rename to rust-crates/symb-proto/symbfile.proto diff --git a/rust-crates/symblib-capi/.gitignore b/rust-crates/symblib-capi/.gitignore new file mode 100644 index 00000000..9c318041 --- /dev/null +++ b/rust-crates/symblib-capi/.gitignore @@ -0,0 +1,2 @@ +c/demo +go/go diff --git a/rust-crates/symblib-capi/Cargo.toml b/rust-crates/symblib-capi/Cargo.toml new file mode 100644 index 00000000..60f098db --- /dev/null +++ b/rust-crates/symblib-capi/Cargo.toml @@ -0,0 +1,14 @@ +[package] +name = "symblib-capi" +edition = "2021" +version.workspace = true +rust-version.workspace = true + +[lib] +crate-type = ["staticlib", "cdylib"] + +[dependencies] +symblib.path = "../symblib" + +fallible-iterator.workspace = true +thiserror.workspace = true diff --git a/rust-crates/symblib-capi/README.md b/rust-crates/symblib-capi/README.md new file mode 100644 index 00000000..8c287789 --- /dev/null +++ b/rust-crates/symblib-capi/README.md @@ -0,0 +1,4 @@ +symblib C API +============= + +This crate exposes the public core API of symblib as a C library. diff --git a/rust-crates/symblib-capi/c/Makefile b/rust-crates/symblib-capi/c/Makefile new file mode 100644 index 00000000..f349fed6 --- /dev/null +++ b/rust-crates/symblib-capi/c/Makefile @@ -0,0 +1,19 @@ +.PHONY: all clean run-demo + +RUST_WORKSPACE_DIR = ../../.. +TARGET_DIR = $(RUST_WORKSPACE_DIR)/target/release + +all: demo + +$(TARGET_DIR)/libsymblib_capi.so: ../src/*.rs + cargo build --release --manifest-path $(RUST_WORKSPACE_DIR)/Cargo.toml + +demo: symblib.h demo.c $(TARGET_DIR)/libsymblib_capi.so + cc -g -I. 
-o $@ demo.c -L$(TARGET_DIR) -lsymblib_capi -ldl + +run-demo: demo + LD_LIBRARY_PATH=$(TARGET_DIR) ./demo + +clean: + cargo clean --manifest-path $(RUST_WORKSPACE_DIR)/Cargo.toml + rm -f demo diff --git a/rust-crates/symblib-capi/c/demo.c b/rust-crates/symblib-capi/c/demo.c new file mode 100644 index 00000000..23e8702f --- /dev/null +++ b/rust-crates/symblib-capi/c/demo.c @@ -0,0 +1,96 @@ +// Copyright The OpenTelemetry Authors +// SPDX-License-Identifier: Apache-2.0 + +#include <assert.h> +#include <inttypes.h> +#include <stdbool.h> +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include "symblib.h" + +// Example visitor callback for processing return pads (user_data is unused) +SymblibStatus retpad_visitor(void* user_data, const SymblibReturnPad* ret_pad) { + printf("\nReturn pad at ELF VA: 0x%08" PRIx64 "\n", ret_pad->elf_va); + + // Iterate over each entry in the SymblibReturnPad; the slice data is borrowed and read-only + for (size_t i = 0; i < ret_pad->entries.len; ++i) { + const SymblibReturnPadEntry* entry = &((const SymblibReturnPadEntry*)ret_pad->entries.data)[i]; + printf("\tEntry %zu:\n", i); + printf("\t\tFunction: %s\n", entry->func ? entry->func : "(null)"); + printf("\t\tFile: %s\n", entry->file ? entry->file : "(null)"); + printf("\t\tLine: %u\n", entry->line); + } + + return SYMBLIB_OK; +} + +// Example visitor callback for processing ranges; user_data carries the SymblibRetPadExtractor +SymblibStatus range_visitor(void* user_data, const SymblibRange* range) { + printf("\nSymbol range at ELF VA: 0x%08" PRIx64 "\n", range->elf_va); + printf("\tFunction: %s\n", range->func ? range->func : "(null)"); + printf("\tFile: %s\n", range->file ? range->file : "(null)"); + printf("\tCall File: %s\n", range->call_file ? range->call_file : "(null)"); + printf("\tCall Line: %u\n", range->call_line); + printf("\tDepth: %u\n", range->depth); + printf("\tLine Table Length: %zu\n", range->line_table.len); + + // Submit the range to the return pad extractor.
+ SymblibStatus err = symblib_retpadextr_submit( + (SymblibRetPadExtractor*)user_data, range, retpad_visitor, NULL); + if (err != SYMBLIB_OK) { + fprintf(stderr, "Failed to submit range for extraction\n"); + return err; + } + + return 0; +} + +int main(int argc, const char** argv) { + const char* executable; + + switch (argc) { + case 0: + return EXIT_FAILURE; + case 1: + // Use this binary. + executable = argv[0]; + break; + default: + // Use user-passed file. + executable = argv[1]; + } + + printf("Starting range extraction for executable: %s\n", executable); + + // Initialize the global return pad extractor. + // We use it in the range extractor visitor. + SymblibRetPadExtractor* extr = NULL; + SymblibStatus err = symblib_retpadextr_new(executable, &extr); + if (err != SYMBLIB_OK) { + fprintf(stderr, "Failed to create global SymblibRetPadExtractor\n"); + return EXIT_FAILURE; + } + assert(extr != NULL); + + // Call the range extraction function with our visitor. + err = symblib_rangeextr(executable, false, range_visitor, extr); + if (err != SYMBLIB_OK) { + fprintf(stderr, "Error during range extraction: %d\n", err); + symblib_retpadextr_free(extr); + return EXIT_FAILURE; + } + + // Notify the return pad extractor that we're done. 
+ err = symblib_retpadextr_submit(extr, NULL, retpad_visitor, NULL); + if (err != SYMBLIB_OK) { + fprintf(stderr, "Failed to submit end-of-ranges marker\n"); + symblib_retpadextr_free(extr); + return EXIT_FAILURE; + } + + printf("\nRange extraction completed successfully.\n"); + + symblib_retpadextr_free(extr); + return EXIT_SUCCESS; +} diff --git a/rust-crates/symblib-capi/c/symblib.h b/rust-crates/symblib-capi/c/symblib.h new file mode 100644 index 00000000..df3c2be0 --- /dev/null +++ b/rust-crates/symblib-capi/c/symblib.h @@ -0,0 +1,145 @@ +// Copyright The OpenTelemetry Authors +// SPDX-License-Identifier: Apache-2.0 + +#ifndef SYMBLIB_H +#define SYMBLIB_H + +#include <stdbool.h> +#include <stddef.h> +#include <stdint.h> + +#ifdef __cplusplus +extern "C" { +#endif + +typedef enum SymblibStatus { + SYMBLIB_OK = 0, + SYMBLIB_ERR_IOMISC = 1, + SYMBLIB_ERR_IOFILENOTFOUND = 2, + SYMBLIB_ERR_OBJFILE = 3, + SYMBLIB_ERR_DWARF = 4, + SYMBLIB_ERR_SYMBCONV = 5, + SYMBLIB_ERR_RETPAD = 6, + SYMBLIB_ERR_BADUTF8 = 7, + SYMBLIB_ERR_ALREADYCLOSED = 8, +} SymblibStatus; + +// Opaque handle to a return pad extractor. +typedef struct SymblibRetPadExtractor SymblibRetPadExtractor; + +// Rust managed string. +typedef const char* SymblibString; + +// Array of objects. +typedef struct { + // Pointer to the first item in the slice. + // + // May or may not be NULL if `len == 0`: don't rely on it. + const void* data; + + // Number of entries in the slice. + size_t len; +} SymblibSlice; + +// Entry in the return pad inline trace. +// +// See symbfile.proto for details. +typedef struct { + SymblibString func; // never null + SymblibString file; // may be null + uint32_t line; // 0 = unknown +} SymblibReturnPadEntry; + +// Symbol info for a return pad. +// +// See symbfile.proto for details. +typedef struct { + uint64_t elf_va; + SymblibSlice/*<SymblibReturnPadEntry>*/ entries; +} SymblibReturnPad; + +// Symbol info for a PC range. +// +// See symbfile.proto for details.
+typedef struct { + uint64_t elf_va; + uint32_t length; + SymblibString func; + SymblibString file; + SymblibString call_file; + uint32_t call_line; + uint32_t depth; + SymblibSlice/*<SymblibLineTableEntry>*/ line_table; + // Omitted internal Rust-specific field rust_range +} SymblibRange; + +// Entry in a range's line table. +// +// See symbfile.proto for details. +typedef struct { + uint32_t offset; + uint32_t line_number; +} SymblibLineTableEntry; + +// Visitor callback for extracted ranges. +// +// The range is **borrowed** to the callee and the pointer is only valid for +// the duration of the visitor call. Returning an error will abort further +// execution and return early. +typedef SymblibStatus (*SymblibRangeVisitor)(void* user_data, const SymblibRange*); + +// Visitor callback for return pads. +// +// The return pad is **borrowed** to the callee and the pointer is only valid +// for the duration of the visitor call. Returning an error will abort further +// execution and return early. +typedef SymblibStatus (*SymblibRetPadVisitor)(void* user_data, const SymblibReturnPad*); + +// Extract ranges from an executable. +// +// This creates a range extractor with all supported debug symbol formats. The +// extractor is then run to completion and the visitor is invoked for every +// range that is found in the executable. The user_data pointer is passed to +// the visitor untouched and may be NULL. +extern SymblibStatus symblib_rangeextr( + const char* executable, + bool follow_alt_link, + SymblibRangeVisitor visitor, + void* user_data +); + +// Create a new return pad extractor. +// +// The instance must be freed via a call to `symblib_retpadextr_free`. +extern SymblibStatus symblib_retpadextr_new( + const char* executable, + SymblibRetPadExtractor** extr +); + +// Submit a new range to the return pad extractor. +// +// The callback may be invoked 0..n times for each range submitted.
Processing +// is happening asynchronously in the background: there is no guarantee that +// the return pads passed to the visitor at each call correspond to the range +// that was just submitted. +// +// The user_data pointer is passed to the visitor untouched and may be NULL. +// +// Once all ranges have been submitted, call this function with a `NULL` range +// once to indicate this, forcing all remaining buffered return pads to be +// flushed. +extern SymblibStatus symblib_retpadextr_submit( + SymblibRetPadExtractor* extr, + const SymblibRange* range, + SymblibRetPadVisitor visitor, + void* user_data +); + +// Frees a return pad extractor. +extern void symblib_retpadextr_free(SymblibRetPadExtractor* extr); + +#ifdef __cplusplus +} +#endif + +#endif // SYMBLIB_H diff --git a/rust-crates/symblib-capi/go/Makefile b/rust-crates/symblib-capi/go/Makefile new file mode 100644 index 00000000..f6d2e320 --- /dev/null +++ b/rust-crates/symblib-capi/go/Makefile @@ -0,0 +1,13 @@ +.PHONY: all clean + +all: + CGO_ENABLED=1 \ + go build \ + -mod=readonly \ + -ldflags='-linkmode external -extldflags=-static' \ + -trimpath \ + -tags 'static_build' + + +clean: + go clean diff --git a/rust-crates/symblib-capi/go/main.go b/rust-crates/symblib-capi/go/main.go new file mode 100644 index 00000000..17ccd33f --- /dev/null +++ b/rust-crates/symblib-capi/go/main.go @@ -0,0 +1,96 @@ +// Copyright The OpenTelemetry Authors +// SPDX-License-Identifier: Apache-2.0 + +package main + +/* +#cgo LDFLAGS: ${SRCDIR}/../../../target/release/libsymblib_capi.a +#cgo CFLAGS: -g -Wall +#include "../c/symblib.h" +#include + +// Declare wrapper functions for linking. 
+SymblibStatus rangeVisitorWrapper(void* user_data, SymblibRange* range); +SymblibStatus retPadVisitorWrapper(void* user_data, SymblibReturnPad* ret_pad); +*/ +import "C" +import ( + "fmt" + "os" + "unsafe" +) + +//export retPadVisitorWrapper +func retPadVisitorWrapper(_ unsafe.Pointer, retPadPtr *C.SymblibReturnPad) C.SymblibStatus { + // Process the return pad data + elfVA := uint64(retPadPtr.elf_va) + entriesCount := int(retPadPtr.entries.len) + fmt.Printf("Return Pad: ELF VA: 0x%x, Entries: %d\n", elfVA, entriesCount) + + return C.SYMBLIB_OK +} + +//export rangeVisitorWrapper +func rangeVisitorWrapper(userData unsafe.Pointer, rangePtr *C.SymblibRange) C.SymblibStatus { + elfVA := uint64(rangePtr.elf_va) + length := uint32(rangePtr.length) + file := C.GoString(rangePtr.file) + // cgo transforms the field func in SymblibRange to _func + // as func is a reserved keyword in Go. + function := C.GoString(rangePtr._func) + + fmt.Printf("Range: ELF VA: 0x%x, Length: %d, Function: %s File: %s\n", + elfVA, length, function, file) + + return C.symblib_retpadextr_submit( + (*C.SymblibRetPadExtractor)(userData), + rangePtr, + C.SymblibRetPadVisitor(C.retPadVisitorWrapper), + nil, + ) +} + +func mainWithExitCode() int { + // For the purpose of demonstration symbolize the executable themselves. + executablePath := C.CString(os.Args[0]) + defer C.free(unsafe.Pointer(executablePath)) + + // Initialize the global return pad extractor. + // We use it in the range extractor visitor. + var extractor *C.SymblibRetPadExtractor + + //nolint:gocritic + status := C.symblib_retpadextr_new(executablePath, &extractor) + if status != C.SYMBLIB_OK { + fmt.Fprintf(os.Stderr, "Failed to create return pad extractor: %d\n", status) + return 1 + } + defer C.symblib_retpadextr_free(extractor) + + // Call the range extraction function with our visitor. 
+ status = C.symblib_rangeextr( + executablePath, + C.bool(true), + C.SymblibRangeVisitor(C.rangeVisitorWrapper), + unsafe.Pointer(extractor), + ) + if status != C.SYMBLIB_OK { + fmt.Fprintf(os.Stderr, "Failed to extract ranges: %d\n", status) + return 1 + } + + // Notify the return pad extractor that we're done. + status = C.symblib_retpadextr_submit(extractor, nil, + C.SymblibRetPadVisitor(C.retPadVisitorWrapper), nil) + if status != C.SYMBLIB_OK { + fmt.Fprintf(os.Stderr, "Failed to notify retpad extractor: %d\n", status) + return 1 + } + + fmt.Println("Ranges extracted successfully") + return 0 +} + +func main() { + os.Exit(mainWithExitCode()) +} diff --git a/rust-crates/symblib-capi/src/ffislice.rs b/rust-crates/symblib-capi/src/ffislice.rs new file mode 100644 index 00000000..06a825a1 --- /dev/null +++ b/rust-crates/symblib-capi/src/ffislice.rs @@ -0,0 +1,62 @@ +// Copyright The OpenTelemetry Authors +// SPDX-License-Identifier: Apache-2.0 + +use std::{marker::PhantomData, mem, ptr, slice}; + +/// Read-only, owned FFI-safe owned slice type. +/// +/// The caller must ensure that `T` is FFI-safe (`#[repr(C)]`). +#[repr(C)] +#[derive(Debug)] +pub struct SymblibSlice { + /// Data pointer. + /// + /// May or may not be null for empty slices: don't rely on it. + data: *mut T, + + /// Number of entries in the slice. + len: usize, + + /// Make compiler print warnings if `T` isn't FFI-safe. 
+ _marker: PhantomData, +} + +impl From> for SymblibSlice { + fn from(vec: Vec) -> Self { + let mut s = vec.into_boxed_slice(); + let data = s.as_mut_ptr(); + let len = s.len(); + mem::forget(s); + + Self { + data, + len, + _marker: PhantomData, + } + } +} + +impl From> for Box<[T]> { + fn from(s: SymblibSlice) -> Self { + unsafe { + let std_slice = slice::from_raw_parts_mut(s.data, s.len); + mem::forget(s); + Box::<[T]>::from_raw(std_slice) + } + } +} + +impl From> for Vec { + fn from(s: SymblibSlice) -> Self { + Vec::from(Box::<[T]>::from(s)) + } +} + +impl Drop for SymblibSlice { + fn drop(&mut self) { + // Drop by converting to boxed slice and then dropping the slice. + drop(Box::<[T]>::from(unsafe { ptr::read(self) })); + } +} + +unsafe impl Send for SymblibSlice {} diff --git a/rust-crates/symblib-capi/src/ffistr.rs b/rust-crates/symblib-capi/src/ffistr.rs new file mode 100644 index 00000000..9fc0eb88 --- /dev/null +++ b/rust-crates/symblib-capi/src/ffistr.rs @@ -0,0 +1,48 @@ +// Copyright The OpenTelemetry Authors +// SPDX-License-Identifier: Apache-2.0 + +use std::ffi::{c_char, CString}; +use std::{mem, ptr}; + +/// Read-only, nullable, owned FFI-safe string type. 
+#[derive(Debug)] +#[repr(transparent)] +pub struct SymblibString(*mut c_char); + +impl From> for SymblibString { + fn from(maybe_str: Option) -> Self { + match maybe_str { + Some(s) => s.into(), + None => SymblibString(ptr::null_mut()), + } + } +} + +impl From for SymblibString { + fn from(x: String) -> Self { + Self(unsafe { CString::from_vec_unchecked(x.into_bytes()).into_raw() }) + } +} + +impl From for Option { + fn from(maybe_str: SymblibString) -> Self { + if maybe_str.0.is_null() { + None + } else { + let cstr = unsafe { CString::from_raw(maybe_str.0) }; + mem::forget(maybe_str); + Some(cstr.into_string().unwrap()) + } + } +} + +impl Drop for SymblibString { + fn drop(&mut self) { + if !self.0.is_null() { + drop(unsafe { CString::from_raw(self.0 as _) }); + self.0 = ptr::null_mut(); + } + } +} + +unsafe impl Send for SymblibString {} diff --git a/rust-crates/symblib-capi/src/lib.rs b/rust-crates/symblib-capi/src/lib.rs new file mode 100644 index 00000000..b6664ad7 --- /dev/null +++ b/rust-crates/symblib-capi/src/lib.rs @@ -0,0 +1,16 @@ +// Copyright The OpenTelemetry Authors +// SPDX-License-Identifier: Apache-2.0 + +#![doc = include_str!("../README.md")] + +mod ffislice; +mod ffistr; +mod rangeextr; +mod retpadextr; +mod status; + +pub use ffislice::*; +pub use ffistr::*; +pub use rangeextr::*; +pub use retpadextr::*; +pub use status::*; diff --git a/rust-crates/symblib-capi/src/rangeextr.rs b/rust-crates/symblib-capi/src/rangeextr.rs new file mode 100644 index 00000000..3fc44469 --- /dev/null +++ b/rust-crates/symblib-capi/src/rangeextr.rs @@ -0,0 +1,216 @@ +// Copyright The OpenTelemetry Authors +// SPDX-License-Identifier: Apache-2.0 + +use super::{FfiResult, StatusCode, SymblibSlice, SymblibString}; +use std::ffi::{c_char, c_void, CStr, OsString}; +use std::os::unix::ffi::OsStringExt as _; +use std::path::{Path, PathBuf}; +use symblib::objfile::{self, SymbolSource}; +use symblib::symbconv::RangeExtractor as _; +use symblib::{dwarf, symbconv as sc, 
symbfile, VirtAddr}; + +/// Extract ranges from an executable. +/// +/// This creates a [`symblib::symbconv::multi`] extractor with all supported +/// debug symbol formats registered with the following priority: +/// +/// 1) DWARF +/// 2) Go symbols +/// 3) ELF debug symbols +/// 4) ELF dynamic symbols +/// +/// This extractor is then run to completion and the visitor is invoked for +/// every range found in the executable. The user_data pointer is passed to +/// the visitor untouched and may be NULL. +#[no_mangle] +pub unsafe extern "C" fn symblib_rangeextr( + executable: *const c_char, + follow_alt_link: bool, + visitor: SymblibRangeVisitor, + user_data: *mut c_void, +) -> StatusCode { + match rangeextr_impl(executable, follow_alt_link, visitor, user_data) { + Ok(()) => StatusCode::Ok, + Err(e) => e, + } +} + +unsafe fn rangeextr_impl( + executable: *const c_char, + follow_alt_link: bool, + visitor: SymblibRangeVisitor, + user_data: *mut c_void, +) -> FfiResult { + assert!(!executable.is_null()); + + let executable = Path::new(unsafe { CStr::from_ptr(executable).to_str()? }); + + // Open and mmap main object file. + let obj = objfile::File::load(executable)?; + let obj_reader = obj.parse()?; + + // Resolve and use alt link, if requested by caller. + let mut sup_obj_path: Option = None; + if follow_alt_link { + sup_obj_path = match resolve_alt_link(executable, &obj_reader) { + Ok(x) => x, + Err(StatusCode::IoFileNotFound) => None, + Err(other) => return Err(other), + } + } + + // Load DWARF sections. + let mut dw = dwarf::Sections::load(&obj_reader)?; + + // If a supplementary path was found, load its data. 
+ let sup_obj; + let sup_reader; + if let Some(sup_obj_path) = sup_obj_path { + sup_obj = objfile::File::load(&sup_obj_path)?; + sup_reader = sup_obj.parse()?; + dw.load_sup(&sup_reader)?; + } + + let mut extr = sc::multi::Extractor::new(&obj_reader)?; + extr.add("dwarf", sc::dwarf::Extractor::new(&dw)); + extr.add("go", sc::go::Extractor::new(&obj_reader)); + extr.add( + "dbg-obj-sym", + sc::obj::Extractor::new(&obj_reader, SymbolSource::Debug), + ); + extr.add( + "dyn-obj-sym", + sc::obj::Extractor::new(&obj_reader, SymbolSource::Dynamic), + ); + + // Run the extractor with the user's callback. + let result = extr.extract(&mut |rng| { + let ffi_rng = SymblibRange::from(rng); + match visitor(user_data, &ffi_rng) { + StatusCode::Ok => Ok(()), + code => Err(Box::new(code)), + } + }); + + // Extract the error code from the visitor error branches. + match result { + Ok(_) => Ok(()), + Err( + sc::Error::Dwarf(sc::dwarf::Error::Visitor(v)) + | sc::Error::Go(sc::go::Error::Visitor(v)) + | sc::Error::Obj(v), + ) => Err(v + .downcast::() + .map(|x| *x) + .unwrap_or(StatusCode::Symbconv)), + Err(_) => Err(StatusCode::Symbconv), + } +} + +fn resolve_alt_link(exec_path: &Path, obj: &objfile::Reader) -> FfiResult> { + let alt_link = obj.gnu_debug_alt_link()?; + + let Some(alt_link) = alt_link else { + return Ok(None); + }; + + // Turn array of bytes into a proper path. + let alt_path = OsString::from_vec(alt_link.path); + let alt_path = PathBuf::from(alt_path); + + if alt_path.is_absolute() { + return Ok(Some(alt_path)); + } + + Ok(Some( + exec_path + .canonicalize()? + .parent() + .expect("absolute file path should always have a parent") + .join(&alt_path), + )) +} + +/// Visitor callback for extracted ranges. +/// +/// The range is **borrowed** to the callee and the pointer is only valid for +/// the duration of the visitor call. Returning an error will abort further +/// execution and return early. 
+pub type SymblibRangeVisitor = + unsafe extern "C" fn(user_data: *mut c_void, range: *const SymblibRange) -> StatusCode; + +/// FFI-safe variant of [`symbfile::Range`]. +#[repr(C)] +#[derive(Debug)] +pub struct SymblibRange { + pub elf_va: VirtAddr, + pub length: u32, + pub func: SymblibString, // never null + pub file: SymblibString, // may be null + pub call_file: SymblibString, // may be null + pub call_line: u32, // 0 = unknown + pub depth: u32, + pub line_table: SymblibSlice, + + // Internal, for return pad code use. + pub(crate) rust_range: Box, +} + +impl From for SymblibRange { + fn from(rng: symbfile::Range) -> Self { + let rust_range = Box::new(rng.clone()); + let table: Vec = + rng.line_table.into_iter().map(Into::into).collect(); + + Self { + elf_va: rng.elf_va, + length: rng.length, + func: rng.func.into(), + file: rng.file.into(), + call_file: rng.call_file.into(), + call_line: rng.call_line.unwrap_or(0), + depth: rng.depth, + line_table: SymblibSlice::from(table), + rust_range, + } + } +} + +/// FFI-safe variant of [`symbfile::LineTableEntry`]. 
+#[repr(C)] +#[derive(Debug)] +pub struct SymblibLineTableEntry { + pub offset: u32, + pub line_number: u32, +} + +impl From for SymblibLineTableEntry { + fn from(entry: symbfile::LineTableEntry) -> Self { + Self { + offset: entry.offset, + line_number: entry.line_number, + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use std::ptr; + + #[test] + fn rangeextr() { + let file = c"../symblib/testdata/inline"; + + extern "C" fn visitor(_: *mut c_void, rng: *const SymblibRange) -> StatusCode { + assert_ne!(rng, ptr::null()); + dbg!(unsafe { &*rng }); + StatusCode::Ok + } + + assert_eq!( + unsafe { symblib_rangeextr(file.as_ptr(), false, visitor, ptr::null_mut()) }, + StatusCode::Ok + ); + } +} diff --git a/rust-crates/symblib-capi/src/retpadextr.rs b/rust-crates/symblib-capi/src/retpadextr.rs new file mode 100644 index 00000000..137f5edc --- /dev/null +++ b/rust-crates/symblib-capi/src/retpadextr.rs @@ -0,0 +1,281 @@ +// Copyright The OpenTelemetry Authors +// SPDX-License-Identifier: Apache-2.0 + +//! Expose return pad generation to C. +//! +//! This currently uses a background thread to do the processing because that +//! is the easiest way to bridge the push-based range extraction API with +//! the pull-based iterator API consumed by the return pad generation code. +//! It may simplify things in the future if we were to rework the return pad +//! generation code to be push-based as well. + +use super::{SymblibSlice, SymblibString}; +use crate::{FfiResult, StatusCode, SymblibRange}; +use fallible_iterator::{FallibleIterator, IteratorExt as _}; +use std::ffi::{c_char, c_void, CStr}; +use std::path::Path; +use std::sync::mpsc::TrySendError; +use std::thread::JoinHandle; +use std::{sync::mpsc, thread}; +use symblib::{objfile, retpads, symbfile, VirtAddr}; + +/// Create a new return pad extractor. +/// +/// The instance must be freed via a call to [`symblib_retpadextr_free`]. 
+#[no_mangle] +pub unsafe extern "C" fn symblib_retpadextr_new( + executable: *const c_char, + extr: *mut *mut SymblibRetPadExtractor, // out arg +) -> StatusCode { + match retpadextr_new_impl(executable, extr) { + Ok(()) => StatusCode::Ok, + Err(e) => e, + } +} + +unsafe fn retpadextr_new_impl( + executable: *const c_char, + extr: *mut *mut SymblibRetPadExtractor, // out arg +) -> FfiResult { + assert!(!executable.is_null()); + let executable = CStr::from_ptr(executable) + .to_str() + .map(Path::new) + .map_err(|_| StatusCode::BadUtf8)?; + + // Open and mmap main object file. + let obj = objfile::File::load(Path::new(executable))?; + let (range_tx, range_rx) = mpsc::sync_channel(128); + let (ret_pad_tx, ret_pad_rx) = mpsc::sync_channel(128); + + let thread_handle = Some(thread::spawn(move || { + extractor_thread(obj, range_rx, ret_pad_tx) + })); + + *extr = Box::into_raw(Box::new(SymblibRetPadExtractor { + thread_handle, + ret_pad_rx, + range_tx: Some(range_tx), + })); + + Ok(()) +} + +fn extractor_thread( + obj: objfile::File, + range_rx: mpsc::Receiver, + ret_pad_tx: mpsc::SyncSender, +) -> FfiResult { + let obj_reader = obj.parse()?; + + let range_iter = range_rx + .into_iter() + .into_fallible() + .map_err(|_| -> retpads::Error { unreachable!("source iterator is infallible") }); + + retpads::extract_retpads(&obj_reader, range_iter, |ret_pad| { + ret_pad_tx + .send(SymblibReturnPad::from(ret_pad)) + .map_err(|_| retpads::Error::Other(std::io::Error::other("TODO").into())) + })?; + + Ok(()) +} + +/// Visitor callback for symbol events. +/// +/// The return pad is **borrowed** to the callee and the pointer is only valid +/// for the duration of the visitor call. Returning an error will abort further +/// execution and return early. +pub type RetPadVisitor = + unsafe extern "C" fn(user_data: *mut c_void, ret_pad: *const SymblibReturnPad) -> StatusCode; + +/// Submit a new range to the return pad extractor. 
+/// +/// The callback may be invoked 0..n times for each range submitted. Processing +/// is happening asynchronously in the background: there is no guarantee that +/// the return pads passed to the visitor at each call correspond to the range +/// that was just submitted. +/// +/// The user_data pointer is passed to the visitor untouched and may be `NULL`. +/// +/// Once all ranges have been submitted, call this function with a `NULL` range +/// once to indicate this to force all remaining buffered return pads to be +/// flushed. +#[no_mangle] +pub unsafe extern "C" fn symblib_retpadextr_submit( + extr: *mut SymblibRetPadExtractor, + range: *const SymblibRange, + visitor: RetPadVisitor, + user_data: *mut c_void, +) -> StatusCode { + match retpadextr_submit_impl(extr, range, visitor, user_data) { + Ok(()) => StatusCode::Ok, + Err(e) => e, + } +} + +unsafe fn retpadextr_submit_impl( + extr: *mut SymblibRetPadExtractor, + range: *const SymblibRange, + visitor: RetPadVisitor, + user_data: *mut c_void, +) -> FfiResult { + assert!(!extr.is_null()); + let extr: &mut SymblibRetPadExtractor = &mut *extr; + + // Wrap visitor to make it rustier. + let visitor = |rng: SymblibReturnPad| -> FfiResult { + FfiResult::from(unsafe { visitor(user_data, &rng) }) + }; + + // Communicate with the worker. + if range.is_null() { + // Null range indicates end of ranges: drop our range TX to notify the + // worker thread that we're done here. + drop(extr.range_tx.take()); + + // Blockingly read back results until the thread drops the channel. + extr.ret_pad_rx.iter().try_for_each(visitor)?; + + // Wait for thread to exit and retrieve the result. 
+ extr.thread_handle + .take() + .map(|x| x.join().unwrap(/* forward panic */)) + .transpose()?; + } else { + let Some(range_tx) = &extr.range_tx else { + return Err(StatusCode::AlreadyClosed); + }; + + let mut range = symbfile::Range::clone(&(*range).rust_range); + while let Err(e) = range_tx.try_send(range) { + match e { + TrySendError::Disconnected(_) => { + // TODO: can this even happen? + return Err(StatusCode::AlreadyClosed); + } + TrySendError::Full(returned) => { + // TX channel is clogged. Read back items from the output + // channel until the worker made progress. + extr.ret_pad_rx.try_iter().try_for_each(visitor)?; + std::thread::yield_now(); + range = returned; + } + } + } + + // Read as much as we can without blocking. + extr.ret_pad_rx.try_iter().try_for_each(visitor)?; + } + + Ok(()) +} + +/// Frees a return pad extractor. +#[no_mangle] +pub unsafe extern "C" fn symblib_retpadextr_free(extr: *mut SymblibRetPadExtractor) { + let extr = Box::from_raw(extr); + if let Some((handle, rx)) = extr.thread_handle.zip(extr.range_tx) { + drop(rx); + handle.join().unwrap(/* forward panic */).ok(); + } +} + +/// Handle to a return pad extractor background thread. +/// +/// Opaque to C. +#[repr(C)] +pub struct SymblibRetPadExtractor { + thread_handle: Option>, + range_tx: Option>, + ret_pad_rx: mpsc::Receiver, +} + +/// FFI-safe variant of [`symbfile::ReturnPad`]. +#[repr(C)] +#[derive(Debug)] +pub struct SymblibReturnPad { + pub elf_va: VirtAddr, + pub entries: SymblibSlice, +} + +impl From for SymblibReturnPad { + fn from(pad: symbfile::ReturnPad) -> Self { + let entries: Vec = pad.entries.into_iter().map(Into::into).collect(); + + Self { + elf_va: pad.elf_va, + entries: entries.into(), + } + } +} + +/// FFI-safe variant of [`symbfile::ReturnPadEntry`]. 
+#[repr(C)] +#[derive(Debug)] +pub struct SymblibReturnPadEntry { + pub func: SymblibString, // never null + pub file: SymblibString, // may be null + pub line: u32, // 0 = unknown +} + +impl From for SymblibReturnPadEntry { + fn from(entry: symbfile::ReturnPadEntry) -> Self { + Self { + func: entry.func.into(), + file: entry.file.into(), + line: entry.line.unwrap_or(0), + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::*; + use std::ptr; + + extern "C" fn retpad_visitor( + user_data: *mut c_void, + rng: *const SymblibReturnPad, + ) -> StatusCode { + assert!(user_data.is_null()); + assert!(!rng.is_null()); + dbg!(unsafe { &*rng }); + StatusCode::Ok + } + + unsafe extern "C" fn rng_visitor( + user_data: *mut c_void, + rng: *const SymblibRange, + ) -> StatusCode { + let extr = user_data as *mut SymblibRetPadExtractor; + symblib_retpadextr_submit(extr, rng, retpad_visitor, ptr::null_mut()) + } + + #[test] + fn rng_retpad_extr_integration() { + let file = c"../symblib/testdata/inline"; + + let mut extr = ptr::null_mut(); + let mut status = unsafe { symblib_retpadextr_new(file.as_ptr(), &mut extr) }; + assert_eq!(status, StatusCode::Ok); + + status = unsafe { symblib_rangeextr(file.as_ptr(), false, rng_visitor, extr as _) }; + assert_eq!(status, StatusCode::Ok); + + let status = unsafe { + symblib_retpadextr_submit(extr, ptr::null(), retpad_visitor, ptr::null_mut()) + }; + assert_eq!(status, StatusCode::Ok); + + unsafe { + assert!((*extr).thread_handle.is_none()); + assert!((*extr).range_tx.is_none()); + assert_eq!((*extr).ret_pad_rx.iter().count(), 0); + + symblib_retpadextr_free(extr) + } + } +} diff --git a/rust-crates/symblib-capi/src/status.rs b/rust-crates/symblib-capi/src/status.rs new file mode 100644 index 00000000..54686aa2 --- /dev/null +++ b/rust-crates/symblib-capi/src/status.rs @@ -0,0 +1,119 @@ +// Copyright The OpenTelemetry Authors +// SPDX-License-Identifier: Apache-2.0 + +//! 
Defines FFI error codes and their conversion from Rust error types. + +use std::io; +use symblib::{dwarf, objfile, retpads, symbconv}; + +pub type FfiResult = Result; + +/// Error codes exposed to the C API. +/// +/// The errors that we are exposing are currently rather coarsely mapped. +/// In the future, it probably makes sense to expose sub-errors more granularly. +#[repr(C)] +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, thiserror::Error)] +#[non_exhaustive] +pub enum StatusCode { + #[error("OK: not actually an error")] + Ok = 0, + + #[error("IO error")] + IoMisc = 1, + + #[error("IO error: file not found")] + IoFileNotFound = 2, + + #[error("Object file reading error")] + Objfile = 3, + + #[error("DWARF reading error")] + Dwarf = 4, + + #[error("Symbol conversion error")] + Symbconv = 5, + + #[error("Return pad extraction error")] + Retpad = 6, + + #[error("Invalid UTF-8")] + BadUtf8 = 7, + + #[error("The channel was already closed in a previous call")] + AlreadyClosed = 8, +} + +impl From for FfiResult { + fn from(code: StatusCode) -> Self { + if code == StatusCode::Ok { + Ok(()) + } else { + Err(code) + } + } +} + +impl From for StatusCode { + fn from(result: FfiResult) -> Self { + match result { + Ok(()) => StatusCode::Ok, + Err(e) => e, + } + } +} + +impl From for StatusCode { + fn from(e: io::Error) -> Self { + if e.kind() == io::ErrorKind::NotFound { + StatusCode::IoFileNotFound + } else { + StatusCode::IoMisc + } + } +} + +impl From for StatusCode { + fn from(e: objfile::Error) -> Self { + match e { + objfile::Error::IO(io) => io.into(), + _ => Self::Objfile, + } + } +} + +impl From for StatusCode { + fn from(e: dwarf::Error) -> Self { + match e { + dwarf::Error::Objfile(x) => x.into(), + _ => Self::Dwarf, + } + } +} + +impl From for StatusCode { + fn from(e: symbconv::Error) -> Self { + match e { + symbconv::Error::Objfile(x) => x.into(), + _ => Self::Symbconv, + } + } +} + +impl From for StatusCode { + fn from(e: symbconv::multi::Error) -> Self { 
+ symbconv::Error::Multi(e).into() + } +} + +impl From for StatusCode { + fn from(_: std::str::Utf8Error) -> Self { + Self::BadUtf8 + } +} + +impl From for StatusCode { + fn from(_: retpads::Error) -> Self { + Self::Retpad + } +} diff --git a/rust-crates/symblib/Cargo.toml b/rust-crates/symblib/Cargo.toml new file mode 100644 index 00000000..426d2bda --- /dev/null +++ b/rust-crates/symblib/Cargo.toml @@ -0,0 +1,28 @@ +[package] +name = "symblib" +edition = "2021" +version.workspace = true +rust-version.workspace = true + +[dependencies] +anyhow.workspace = true +base64.workspace = true +cpp_demangle.workspace = true +fallible-iterator.workspace = true +flate2.workspace = true +gimli.workspace = true +intervaltree.workspace = true +lru.workspace = true +memmap2.workspace = true +object.workspace = true +prost.workspace = true +rustc-demangle.workspace = true +sha2.workspace = true +smallvec.workspace = true +tempfile.workspace = true +thiserror.workspace = true +zstd.workspace = true +zydis.workspace = true + +[build-dependencies] +prost-build.workspace = true diff --git a/rust-crates/symblib/README.md b/rust-crates/symblib/README.md new file mode 100644 index 00000000..9b0f4214 --- /dev/null +++ b/rust-crates/symblib/README.md @@ -0,0 +1,162 @@ +symblib +======= + +`symblib` is our internal library whose purpose is to parse executables or +shared libraries containing debug information and to turn them into our +simplified symbfile symbol storage format. 
+ +## Symbol sources + +libpf currently supports the following symbol sources: + +| Source | Function names | Source and line info | Inline info | +|----------------------|----------------|----------------------|-------------| +| ELF dynamic symbols | yes | not present | not present | +| ELF static symbols | yes | not present | not present | +| DWARF | yes | yes | yes | +| Go runtime info | yes | partial[^1] | partial[^1] | + +[^1]: Extraction supported by our `symblib::gosym` module, but + `symblib::symbconv::go` doesn't support translating them to our symbfile + range format yet. + +## Design philosophy + +Debug symbol formats have a tendency to be very complex. At the same time, they +often force a reader to sift through large amounts of information that is not +relevant for symbolization. + +### Debug symbol format parsing abstractions + +To keep complexity reasonable, we write abstractions for each such complex format +that allow us to access the relevant portion of this data efficiently. For example, +the [`dwarf`] module provides a wrapper around the excellent but rather low-level +[`gimli`] DWARF parsing library that massively simplifies iterating over the symbols +that DWARF contains. + +These abstractions all have the following qualities and design goals: + +- **Use zero-copy parsing as much as possible**\ + Not only is this desirable for performance, but it also allows our memory usage + to largely be independent of the executable size that we are parsing. It's not + at all uncommon to see 5+ GiB debug build executables these days, and we don't + want our symbolization tooling memory usage to scale linearly with executable + size. +- **If allocations are necessary, ensure that they are bounded**\ + Sometimes it is necessary to maintain cache data structures to enable efficient + parsing. If this is the case, make sure not to trust the executable and ensure + that every allocation based on sizes from the executable is checked against + upper bounds.
We don't want a broken executable that claims to have MAX_INT + sections to send us into OOM. +- **If large allocations are necessary, make them using `mmap`ed temp files**\ + Sometimes it's necessary to load executable regions into memory. For example, + DWARF sections can be compressed in the executable, so we can't just parse them + from an `mmap`ed executable directly. In these cases, we decompress the data + into a temporary file and mmap it. This effectively has the performance of a + regular allocation on a machine with enough memory while at the same time + behaving gracefully on low-memory machines by simply swapping back and forth + from disk where necessary. + +Examples of such abstractions: + +- The [`dwarf`] module abstracts over DWARF internals, exposing lazy, zero-copy + symbol iterators. +- The [`objfile`] module exposes transparent decompression and relocation of + executable sections via `mmap`ed temporary files. +- The [`gosym`] module implements parsing for Go's internal runtime structures + and exposes them through lazy, zero-copy iterators. + +### Debugging the debug format parsers + +The debug information emitted by modern compilers is not nearly as well-tested +and maintained as the code that they are generating. It's not at all uncommon to +have at least partially broken debug info. The formats are also often very +complex and it's easy to accidentally do incorrect parsing in our code. + +To simplify debugging and investigating such problems, we tend to have an internal +debugging sub-command for each symbol format abstraction in `symbtool`. For +example, the `dwarf` abstraction has a corresponding `dwarf` sub-command that in +turn has a `dump` sub-command that prints (and optionally filters) all info that +the format abstraction exposes in a format fit for human consumption.
+ +### Range conversion + +All debug symbol formats have their own and often slightly different idea of how +to represent line tables, function name mappings and inline function trees. To +unify the representations, we need to convert these internal representations into +our symbfile format. + +For this purpose, we have the [`symbconv::RangeExtractor`] trait and one type +that implements it for each debug symbol format that we support. For example, +the `RangeExtractor` implementation for DWARF symbols is implemented on the +[`symbconv::dwarf::Extractor`] type. + +We further have one range extractor implementation that allows merging the output +of multiple other range extractors based on priority and coverage maps: +[`symbconv::multi::Extractor`]. + +### Return pads + +#### Motivation + +In a perfect world, the range based symbols would be the only data that we need +for symbolization. However, in practice we have realized that the `_msearch` +queries that are necessary to make the symbol lookups would most likely be too +slow if we did them for every single frame that we want to symbolize. + +We then made the following observations: + +- in every stack trace, all[^all] frames except for the last one will always be + return addresses that follow a call +- only a small fraction of instructions in a typical executable are calls + +Based on these observations, we came up with the idea to: + +- let symbtool search all call instructions in the executable +- generate a return pad record with the complete inline-trace for each return + address that follows the call +- insert these return pads eagerly when the executable is first seen, avoiding + the need for `_msearch` range queries for 95%+ of frames that we encounter, + instead allowing us to do much faster `_mget` point lookups + +[^all]: Not actually entirely accurate: there are some exceptions like signal + frames, but they are relatively rare and we currently don't handle them + correctly.
+ +#### Generation + +To generate return pads, we need the following information: + +- Range-based symbols as generated by [`symbconv::RangeExtractor`] +- The executable including the code sections. This may sound obvious, but in some + cases such as split DWARF the executable is split into two files: one with an + all-zero dummy code section and the debug info (to be stored for debugging) + and one with the code but without debug info (to be deployed into production). +- Support for the object file format (e.g. ELF) and finding the code sections + within them +- A [`disas::InstrDecoder`] implementation for the executable's architecture + +This generally means that as long as the debug info format that you want to +support is somehow based on regular, native ELF executables on Linux, **you +do not have to worry about this and implementing range extraction should be +sufficient to also get return pad extraction support for free**. + +## Error handling design + +`symblib` uses strong-typed error handling. Each major sub-module defines its +own `Error` and `Result` types. These types are usually error `enums` that +explicitly list most common problems that can occur during usage and implement +the [`std::error::Error`] trait via the macros in the [`thiserror`] crate. The +idea here is to allow library users to detect and specifically handle particular +errors. Debug information in real-world executables is often partially broken, +so there's a lot of value to be able to still continue parsing for non-critical +errors. + +In some cases where an error is produced by a third party library that we +abstract over, for example by the `object` library used in [`objfile`], errors +that we don't specifically care about are type erased into [`AnyError`] +(`Box`) to avoid excessive error translation logic or leaking +`object` types from the abstraction.
+ +[`thiserror`]: https://docs.rs/thiserror/latest/thiserror +[`std::error::Error`]: https://doc.rust-lang.org/std/error/trait.Error.html diff --git a/rust-crates/symblib/build.rs b/rust-crates/symblib/build.rs new file mode 100644 index 00000000..9e540822 --- /dev/null +++ b/rust-crates/symblib/build.rs @@ -0,0 +1,12 @@ +// Copyright The OpenTelemetry Authors +// SPDX-License-Identifier: Apache-2.0 + +static PROTO: &str = "../symb-proto/symbfile.proto"; + +fn main() -> Result<(), Box> { + println!("cargo:rerun-if-changed={PROTO}"); + Ok(prost_build::compile_protos( + &[PROTO], + &["../symb-proto"], + )?) +} diff --git a/rust-crates/symblib/src/covmap.rs b/rust-crates/symblib/src/covmap.rs new file mode 100644 index 00000000..7e9e1ea7 --- /dev/null +++ b/rust-crates/symblib/src/covmap.rs @@ -0,0 +1,267 @@ +// Copyright The OpenTelemetry Authors +// SPDX-License-Identifier: Apache-2.0 + +//! Minimal coverage bitmap implementation. + +use crate::{range_overlap, VirtAddr}; +use std::num::NonZeroU64; +use std::{io, ops}; + +/// Coverage tracker for VA ranges. +#[derive(Debug)] +pub struct CovMap { + range: ops::Range, + scale: NonZeroU64, + bit_vec: Vec, +} + +impl CovMap { + /// Allocate a new coverage map for the given address range. + pub fn with_buckets(max_buckets: NonZeroU64, range: ops::Range) -> Self { + let len = range.end.saturating_sub(range.start); + let scale = (len / max_buckets.get()).max(1); + Self::with_scale(NonZeroU64::new(scale).unwrap(), range) + } + + /// Allocate a new coverage map with the given granularity (scale) in bytes. + pub fn with_scale(scale: NonZeroU64, range: ops::Range) -> Self { + let len = range.end.saturating_sub(range.start); + let bit_scale = scale.get() * 8; + let vec_len = len.div_ceil(bit_scale); + + Self { + range, + scale, + bit_vec: vec![0; vec_len as usize], + } + } + + /// Returns the range covered by this map. + pub fn map_range(&self) -> ops::Range { + self.range.clone() + } + + /// Mark the given range as covered. 
+ /// + /// Updates to addresses outside the map's range are discarded. + pub fn add_range(&mut self, rng: ops::Range) { + let Some(bit_indices) = self.bit_indices_for_va_range(rng) else { + return; + }; + + for offset in bit_indices { + let byte = offset / 8; + let bit = offset % 8; + self.bit_vec[byte] |= 1 << bit; + } + } + + fn bit_indices_for_va_range(&self, rng: ops::Range) -> Option> { + let Some(mut overlap) = range_overlap(&self.range, &rng) else { + return None; + }; + + // Rebase to coverage map range. + overlap.start -= self.range.start; + overlap.end -= self.range.start; + + // Reduce resolution to scale. + overlap.start /= self.scale.get(); + overlap.end = overlap.end.div_ceil(self.scale.get()); + + Some(overlap.start as usize..overlap.end as usize) + } + + /// Checks whether the given range is at least partially covered. + pub fn range_partially_covered(&self, rng: ops::Range) -> bool { + let Some(bit_indices) = self.bit_indices_for_va_range(rng) else { + return false; + }; + + for offset in bit_indices { + let byte_offs = offset / 8; + let bit_offs = offset % 8; + let byte = self.bit_vec[byte_offs]; + if byte & (1 << bit_offs) != 0 { + return true; + } + } + + false + } + + /// Prints a coverage map to the given output stream. + /// + /// Uses unicode braille characters for increased compactness. + pub fn print_table(&self, mut out: impl io::Write) -> io::Result<()> { + const CHARS_PER_LINE: usize = 80; + + writeln!(out, "Address ┃ Coverage")?; + writeln!(out, "━━━━━━━━━━━╋━{}", "━".repeat(CHARS_PER_LINE))?; + + for (chunk, i) in self.bit_vec.chunks(CHARS_PER_LINE).zip(0u64..) { + let addr = self.range.start + i * 8 * self.scale.get() * CHARS_PER_LINE as u64; + + write!(out, "0x{:08x} ┃ ", addr)?; + + for block in chunk { + // Unicode braille characters are constructed by adding an u8 + // where each bit corresponds to one of the 8 braille dots to + // the char-code of the first braille char ('\u{2800}'). 
+ let char_code = 0x2800u32 + *block as u32; + write!(out, "{}", char::from_u32(char_code).unwrap())?; + } + + writeln!(out)?; + } + + Ok(()) + } +} + +/// Error indicating that two segments overlap (not allowed). +#[derive(Debug, thiserror::Error)] +#[error("segments have overlap in range {0:?}")] +pub struct SegmentOverlapError(ops::Range); + +macro_rules! impl_map_for_addr { + ( $this:ident, $va:ident $(, $maybe_mut:tt)? ) => {{ + // Fast path for 0..1 inner maps. + match $this.maps.len() { + 0 => return None, + 1 if $this.maps[0].map_range().contains(&$va) => + return Some(&$($maybe_mut)* $this.maps[0]), + 1 => return None, + _ => { /* continue below */ } + } + + // More than one map: bsearch. + match $this.maps.binary_search_by_key(&$va, |x| x.range.start) { + // Exact match. + Ok(idx) => Some(&$($maybe_mut)* $this.maps[idx]), + + // Inner map array is empty. + Err(0) => None, + + // Either found somewhere within a map or outside valid range. + Err(idx) => $this.maps[idx - 1] + .map_range() + .contains(&$va) + .then_some(&$($maybe_mut)* $this.maps[idx - 1]), + } + }}; +} + +/// Coverage tracker for multiple non-overlapping VA ranges. +#[derive(Debug, Default)] +pub struct SegmentedCovMap { + /// Inner maps ordered by start VA. Cannot overlap. + maps: Vec, +} + +impl SegmentedCovMap { + /// Create an empty segmented coverage map. + pub fn new() -> Self { + Self::default() + } + + /// Adds a new segment to the map. + pub fn add_segment(&mut self, new: CovMap) -> Result<&mut Self, SegmentOverlapError> { + for seg in &self.maps { + if let Some(overlap) = range_overlap(&seg.map_range(), &new.map_range()) { + return Err(SegmentOverlapError(overlap)); + } + } + + self.maps.push(new); + + Ok(self) + } + + /// Mark the given range as covered. + /// + /// The range is assigned to the segment containing `rng.start`. If the + /// range spans more than one segment, the portion of the range that doesn't + /// overlap with the range of the initial segment is discarded. 
This + /// limitation is imposed to simplify the implementation and could be lifted + /// later if necessary. + pub fn add_range(&mut self, rng: ops::Range) { + let Some(seg) = self.map_for_addr_mut(rng.start) else { + return; + }; + + seg.add_range(rng) + } + + /// Locates the inner map containing the given VA (mutable). + fn map_for_addr_mut(&mut self, va: VirtAddr) -> Option<&mut CovMap> { + impl_map_for_addr!(self, va, mut) + } + + /// Locates the inner map containing the given VA. + fn map_for_addr(&self, va: VirtAddr) -> Option<&CovMap> { + impl_map_for_addr!(self, va) + } + + /// Checks whether the given range is at least partially covered. + /// + /// Current implementation requires the range start to be contained in the + /// map and won't assign coverage to more than one segment. If you submit a + /// range that starts in segment A and then proceeds into segment B, + /// coverage will only be assigned to segment A. This limitation is imposed + /// to simplify the implementation and could be lifted later if necessary. 
+ pub fn range_partially_covered(&self, rng: ops::Range) -> bool { + let Some(map) = self.map_for_addr(rng.start) else { + return false; + }; + + map.range_partially_covered(rng) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn partial_cov_at_scale_1() { + let scale = NonZeroU64::new(1).unwrap(); + let mut map = CovMap::with_scale(scale, 0x100..0x200); + + map.add_range(0x090..0x110); // A + map.add_range(0x1A0..0x1E0); // B + map.add_range(0x1A2..0x1EF); // C + map.add_range(0x152..0x191); // D + + assert!( + !map.range_partially_covered(0x92..0x94), + "outside of map range and should not be included", + ); + assert!( + !map.range_partially_covered(0x80..0x100), + "outside of map range and should not be included", + ); + + assert!(map.range_partially_covered(0x100..0x101), "inside A"); + assert!(map.range_partially_covered(0x90..0x101), "overlaps A"); + assert!(map.range_partially_covered(0x90..0x101), "overlaps A"); + + for s in 0x1A0..0x1EF { + assert!( + !map.range_partially_covered(s..s), + "empty range doesn't cover anything", + ); + + for l in 1..10 { + assert!( + map.range_partially_covered(s..s + l), + "covered by either B or C", + ); + } + } + + assert!(!map.range_partially_covered(0x110..0x111), "just after A"); + assert!(!map.range_partially_covered(0x1EF..0x1F0), "just after C"); + assert!(!map.range_partially_covered(0x191..0x192), "just after D"); + } +} diff --git a/rust-crates/symblib/src/dbglog.rs b/rust-crates/symblib/src/dbglog.rs new file mode 100644 index 00000000..cb5aca53 --- /dev/null +++ b/rust-crates/symblib/src/dbglog.rs @@ -0,0 +1,28 @@ +// Copyright The OpenTelemetry Authors +// SPDX-License-Identifier: Apache-2.0 + +//! Minimal debug logging support. +//! +//! If we end up needing more elaborate logging later, it is worth considering +//! switching to the `log` crate and a corresponding subscriber. However, for +//! our current needs this seemed overkill. 
+ +use std::sync::atomic::AtomicBool; + +// Re-export to make the macro show up in this module in rustdoc. +pub use crate::debug; + +/// Determines whether [`debug`] messages are actually printed or not. +pub static ENABLED: AtomicBool = AtomicBool::new(false); + +/// Print to stderr if debug printing is enabled. +/// +/// See [`eprintln`] documentation for usage. +#[macro_export] +macro_rules! debug { + ( $($args:tt)* ) => { + if $crate::dbglog::ENABLED.load(::std::sync::atomic::Ordering::Relaxed) { + ::std::eprintln!( $($args)* ); + } + } +} diff --git a/rust-crates/symblib/src/demangle.rs b/rust-crates/symblib/src/demangle.rs new file mode 100644 index 00000000..13cf0642 --- /dev/null +++ b/rust-crates/symblib/src/demangle.rs @@ -0,0 +1,271 @@ +// Copyright The OpenTelemetry Authors +// SPDX-License-Identifier: Apache-2.0 + +//! Cross-language symbol demangling. + +use smallvec::SmallVec; +use std::borrow::Cow; + +/// Strips disambiguation suffixes commonly appended to function clones. +/// +/// Modern compilers frequently create specialized versions of functions that +/// factor in additional information from a call site, e.g. arguments that are +/// constants. The corresponding optimization passes append dot-prefixed suffixes +/// like `.isra.0` to the function name to disambiguate them from the regular +/// function instance. +/// +/// binutils' demangler[1] will simply consider anything after a `.` to be a +/// clone suffix: this works because they only demangle Rust and C++ which +/// both won't otherwise ever have single dots within their name (only double- +/// dots). However, we also care about Go which DOES have regular dots but +/// otherwise doesn't have any prefix that allows clear separation from C +/// symbols with a clone suffix (e.g. `runtime.saveg`). We thus need to keep +/// a white-list of specific known clone suffixes instead, which might turn +/// out to be a bit of a maintenance burden. 
+/// +/// [1]: https://github.com/bminor/binutils-gdb/blob/978042640c/libiberty/cp-demangle.c#L4043 +/// +/// The following shell command can be used to get a list of possible suffixes: +/// +/// ```bash +/// llvm-readelf --syms ~devel/libxul.so.dbg | awk '{ print $8 }' | \ +// rg --passthru -F '..' -r 'DOTDOT' | rg -F . | \ +// cut -d . -f 2- | tr '.' '\n' | sort | uniq -c | sort -nr +/// ``` +fn strip_clone_suffixes(mut name: &str) -> &str { + // Strip suffixes like ".llvm.9420829416740162726", ".constprop.0", etc. + for suffix in &[".clone.", ".constprop.", ".llvm.", ".isra.", ".part."] { + if let Some(pos) = name.rfind(suffix) { + if name[pos + suffix.len()..] + .chars() + .take_while(|&x| x != '.') + .all(|x| x.is_ascii_digit()) + { + name = &name[..pos]; + } + } + } + + // Strip ".cold" suffix. + if let Some(stripped) = name.strip_suffix(".cold") { + name = stripped; + } + + name +} + +fn could_be_rust_symbol(name: &str) -> bool { + // V0 mangling. + if name.starts_with("_R") { + return true; + } + + // Legacy mangling: _ZN.*17h[a-zA-Z0-9]{16}E + if name.starts_with("_ZN") + && name.ends_with('E') + && name.len() > 3 + 3 + 16 + 1 + && &name[name.len() - 3 - 16 - 1..][..3] == "17h" + && name[name.len() - 16 - 1..][..16] + .chars() + .all(|x| x.is_ascii_hexdigit()) + { + return true; + } + + false +} + +fn could_be_itanium_abi_cxx_symbol(name: &str) -> bool { + // With the exception of MSVC, this is the C++ mangling format emitted + // by essentially all modern C++ compilers. + // + // https://clang.llvm.org/doxygen/ItaniumMangle_8cpp_source.html + // https://itanium-cxx-abi.github.io/cxx-abi/abi.html#mangling + name.starts_with("_Z") || name.starts_with("___Z") +} + +/// C++ name formatter dropping template arguments. 
+#[derive(Debug, Default)] +struct CxxFormatter { + buf: String, + stack: SmallVec<[cpp_demangle::DemangleNodeType; 32]>, +} + +impl CxxFormatter { + pub fn finalize(self) -> String { + debug_assert!(self.stack.is_empty()); + self.buf + } +} + +impl cpp_demangle::DemangleWrite for CxxFormatter { + fn push_demangle_node(&mut self, ty: cpp_demangle::DemangleNodeType) { + self.stack.push(ty); + } + + fn write_string(&mut self, s: &str) -> std::fmt::Result { + if matches!(s, "<" | ">") { + return Ok(()); + } + + use cpp_demangle::DemangleNodeType::TemplateArgs; + if self.stack.iter().any(|&x| x == TemplateArgs) { + return Ok(()); + } + + self.buf.push_str(s); + + Ok(()) + } + + fn pop_demangle_node(&mut self) { + self.stack.pop(); + } +} + +/// Demangles the given symbol name. +pub fn demangle(mut name: &str) -> Cow<'_, str> { + name = strip_clone_suffixes(name); + + if could_be_rust_symbol(name) { + if let Ok(demangler) = rustc_demangle::try_demangle(name) { + // The alternate formatting using `#` suppresses the hash suffix. 
+ return Cow::Owned(format!("{:#}", demangler)); + }; + } + + if could_be_itanium_abi_cxx_symbol(name) { + if let Ok(sym) = cpp_demangle::BorrowedSymbol::new(name.as_bytes()) { + let mut formatter = CxxFormatter::default(); + let options = cpp_demangle::DemangleOptions::default(); + if let Ok(()) = sym.structured_demangle(&mut formatter, &options) { + return Cow::Owned(formatter.finalize()); + } + } + } + + Cow::Borrowed(name) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn c() { + assert_eq!(demangle("blah.cold"), "blah"); + assert_eq!(demangle("blah.constprop.0.cold"), "blah"); + assert_eq!(demangle("blah"), "blah"); + assert_eq!( + demangle("_RustIsNotTheOnlyLangWhoseSymbolsCanStartWith_R"), + "_RustIsNotTheOnlyLangWhoseSymbolsCanStartWith_R", + ); + } + + #[test] + fn cxx() { + let mangled = concat!( + "_ZNSt3__111__introsortINS_12__debug_lessINS_6__lessIN14arrow_vendored4date9", + "time_zoneES5_EEEEPS5_EEvT0_S9_T_NS_15iterator_traitsIS9_E15difference_typeE", + ); + let demangled = concat!( + "void std::__1::__introsort(arrow_vendored::date::time_zone*, ", + "arrow_vendored::date::time_zone*, std::__1::__debug_less, ", + "std::__1::iterator_traits::difference_type)", + ); + assert_eq!(demangle(mangled), demangled); + + let mangled = concat!( + "_ZN7mozilla3dom17Selection_BindingL8get_typeEP9JSContext", + "N2JS6HandleIP8JSObjectEEPv19JSJitGetterCallArgs.cold", + ); + let demangled = concat!( + "mozilla::dom::Selection_Binding::get_type(", + "JSContext*, JS::Handle, void*, JSJitGetterCallArgs)", + ); + assert_eq!(demangle(mangled), demangled); + + let mangled = concat!( + "_ZN5media13MojoDecryptor21DecryptAndDecodeVideoE13scoped_refptrINS_13Decoder", + "BufferEERKN4base17RepeatingCallbackIFvNS_9Decryptor6StatusES1_INS_10VideoFrameEEEEE", + ); + let demangled = concat!( + "media::MojoDecryptor::DecryptAndDecodeVideo(", + "scoped_refptr, base::RepeatingCallback const&)", + ); + assert_eq!(demangle(mangled), demangled,); + + let mangled = concat!( + 
"_ZN2js8HeapSlot4postEPNS_12NativeObjectENS0_", + "4KindEjRKN2JS5ValueE.isra.0.cold", + ); + let demangled = concat!( + "js::HeapSlot::post(js::NativeObject*, js::HeapSlot::Kind, ", + "unsigned int, JS::Value const&)" + ); + assert_eq!(demangle(mangled), demangled); + } + + #[test] + fn rust() { + let mangled = concat!( + "_ZN50_$LT$$RF$mut$u20$W$u20$as$u20$core..fmt..Write", + "$GT$10write_char17h40d2a72f9527ade5E.llvm.5999636307758439825", + ); + assert_eq!( + demangle(mangled), + "<&mut W as core::fmt::Write>::write_char", + ); + + let mangled = concat!( + "_ZN71_$LT$rustc_demangle..legacy..Demangle$u20", + "$as$u20$core..fmt..Display$GT$3fmt17h48ee277748f854a8E", + ); + assert_eq!( + demangle(mangled), + "::fmt", + ); + } + + #[test] + fn go() { + let names = &[ + "github.com/googleapis/gnostic/openapiv2..stmp_173", + "runtime.(*mheap).grow", + "runtime.cmpstring", + "type..eq.k8s.io/api/core/v1.NodeSystemInfo", + "go.opemtelemetry.io/ebpf-profiler/libpf.Max[go.shape.uint32_0]", + concat!( + "go.opentelemetry.io/ebpf-profiler/libpf.MapKeysToSlice", + "[go.shape.uint32_0,go.shape.struct {}_1]" + ), + concat!( + r#"type..eq.struct { APIVersion string "json:\"apiVersion,"#, + r#"omitempty\""; Kind string "json:\"kind,omitempty\"" }"#, + ), + concat!( + "go.opentelemetry.io/ebpf-profiler/libpf/xsync.(*RWMutex[go.shape.struct ", + "{ go.opentelemetry.io/ebpf-profiler/processmanager/", + "execinfomanager.interpreterLoaders []go.opentelemetry.io/ebpf-profiler/", + "interpreter.Loader; go.opentelemetry.io/ebpf-profiler/", + "processmanager/execinfomanager.ebpf go.opentelemetry.io/ebpf-profiler/", + "processmanager/ebpf.EbpfHandler; ", + "go.opentelemetry.io/ebpf-profiler/processmanager/", + "execinfomanager.reporter go.opentelemetry.io/ebpf-profiler/", + "interpreter.ReportFrameMetadataFunc; go.opentelemetry.io/ebpf-profiler/", + "processmanager/execinfomanager.executables map[go.opentelemetry.io/", + "ebpf-profiler/host.FileID]*go.opentelemetry.io/ebpf-profiler/", + 
"processmanager/execinfomanager.entry; go.opentelemetry.io/ebpf-profiler/", + "processmanager/execinfomanager.unwindInfoIndex ", + "map[go.opentelemetry.io/ebpf-profiler/libpf/nativeunwind/stackdeltatypes.UnwindInfo", + "]uint16; go.opentelemetry.io/ebpf-profiler/processmanager/", + "execinfomanager.numStackDeltaMapPages uint64 }_0]).WUnlock" + ), + ]; + + // Make sure that Go symbols are passed through untouched + for &name in names { + assert_eq!(demangle(name), name); + } + } +} diff --git a/rust-crates/symblib/src/disas.rs b/rust-crates/symblib/src/disas.rs new file mode 100644 index 00000000..2e0421aa --- /dev/null +++ b/rust-crates/symblib/src/disas.rs @@ -0,0 +1,242 @@ +// Copyright The OpenTelemetry Authors +// SPDX-License-Identifier: Apache-2.0 + +//! Minimal instruction disassembler implementation. + +use crate::{AnyError, VirtAddr}; +use fallible_iterator::FallibleIterator; +use std::iter; + +/// Errors that can occur during instruction decoding. +#[non_exhaustive] +#[allow(missing_docs)] +#[derive(Debug, thiserror::Error)] +pub enum Error { + #[error("Not enough bytes to decode the full instruction at {:#08X}", .0)] + TruncatedInstruction(VirtAddr), + + #[error("Bytes at {:#08X} do not form a valid instruction", .0)] + InvalidInstruction(VirtAddr), + + #[error(transparent)] + Other(AnyError), +} + +/// Trait for instruction decoders. +pub trait InstrDecoder { + /// Decode one instruction and return information. + fn decode(&self, addr: VirtAddr, buf: &[u8]) -> Result; +} + +/// Information about an instruction. +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct InstrInfo { + /// Virtual addresss of the instruction. + pub addr: VirtAddr, + + /// Whether the instruction is a call or syscall. + pub is_call: bool, + + /// Length of the instruction, in bytes. + pub length: u8, +} + +/// AMD64 (X86-64) instruction decoder. +/// +/// Currently implemented using the Zydis library. 
+#[derive(Debug)] +pub struct Amd64InstrDecoder(zydis::Decoder); + +impl Default for Amd64InstrDecoder { + fn default() -> Self { + Self(zydis::Decoder::new64()) + } +} + +impl InstrDecoder for Amd64InstrDecoder { + fn decode(&self, addr: VirtAddr, buf: &[u8]) -> Result { + use zydis::{Mnemonic as M, ZYAN_MODULE_ZYDIS}; + + let insn = match self.0.decode_first::(buf) { + Ok(Some(insn)) => insn, + Ok(None) => return Err(Error::TruncatedInstruction(addr)), + Err(e) if e.module() == ZYAN_MODULE_ZYDIS => { + return Err(Error::InvalidInstruction(addr)) + } + Err(e) => return Err(Error::Other(Box::new(e))), + }; + + Ok(InstrInfo { + addr, + is_call: matches!(insn.mnemonic, M::CALL | M::SYSCALL | M::INT | M::INTO), + length: insn.length, + }) + } +} + +/// ARM64 (aarch64) instruction decoder. +/// +/// Currently a hand-rolled minimal decoder. +#[derive(Debug, Default)] +pub struct Aarch64InstrDecoder; + +impl InstrDecoder for Aarch64InstrDecoder { + fn decode(&self, addr: VirtAddr, buf: &[u8]) -> Result { + // For the few instructions that we care about, it's simple enough to + // do the decoding ourselves, so that's what we're doing here. + + const BL: u32 = 0b10010100000000000000000000000000; + const BL_MASK: u32 = 0b11111100000000000000000000000000; + const BLR: u32 = 0b11010110001111110000000000000000; + const BLR_MASK: u32 = 0b11111111111111111111110000011111; + const SVC: u32 = 0b11010100000000000000000000000001; + const SVC_MASK: u32 = 0b11111111111000000000000000011111; + + if buf.len() < 4 { + return Err(Error::TruncatedInstruction(addr)); + } + + let insn = u32::from_le_bytes(buf[..4].try_into().unwrap()); + let is_bl = (insn & BL_MASK) == BL; + let is_blr = (insn & BLR_MASK) == BLR; + let is_svc = (insn & SVC_MASK) == SVC; + + Ok(InstrInfo { + addr, + is_call: is_bl || is_blr || is_svc, + length: 4, + }) + } +} + +/// Creates an iterator decoding all instructions in the given buffer. 
+pub fn decode_all<'a, D: InstrDecoder + ?Sized + 'a>( + decoder: &'a D, + mut addr: VirtAddr, + mut buf: &'a [u8], +) -> impl FallibleIterator + 'a { + fallible_iterator::convert(iter::from_fn(move || { + if buf.is_empty() { + return None; + } + + let result = decoder.decode(addr, buf); + + if let Ok(insn) = &result { + buf = &buf[insn.length as usize..]; + addr += insn.length as VirtAddr; + } + + Some(result) + })) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn dec_all() { + let dec = Amd64InstrDecoder::default(); + let mut iter = decode_all(&dec, 0x10, b"\xCC\x90\xE8\x11\x22\x33\x44"); + assert_eq!( + iter.next().unwrap(), + Some(InstrInfo { + is_call: false, + length: 1, + addr: 0x10, + }) + ); + assert_eq!( + iter.next().unwrap(), + Some(InstrInfo { + is_call: false, + length: 1, + addr: 0x11, + }) + ); + assert_eq!( + iter.next().unwrap(), + Some(InstrInfo { + is_call: true, + length: 5, + addr: 0x12, + }) + ); + assert_eq!(iter.next().unwrap(), None); + } + + #[test] + fn amd64() { + let dec = Amd64InstrDecoder::default(); + assert!(matches!( + dec.decode(0, b""), + Err(Error::TruncatedInstruction(0)) + )); + assert!(matches!( + dec.decode(0, b"\xE9"), + Err(Error::TruncatedInstruction(0)) + )); + assert_eq!( + dec.decode(0, b"\xE9\x00\x00\x00\x00").unwrap(), + InstrInfo { + addr: 0, + length: 5, + is_call: false + } + ); + assert_eq!( + dec.decode(123, b"\xE8\x00\x00\x00\x00").unwrap(), + InstrInfo { + addr: 123, + length: 5, + is_call: true, + } + ); + } + + #[test] + fn aarch64() { + let dec = Aarch64InstrDecoder::default(); + assert!(matches!( + dec.decode(0, b""), + Err(Error::TruncatedInstruction(0)) + )); + assert!(matches!( + dec.decode(0, b"\xAA"), + Err(Error::TruncatedInstruction(0)) + )); + assert_eq!( + dec.decode(33, b"\x00\x00\x3f\xd6").unwrap(), + InstrInfo { + addr: 33, + is_call: true, + length: 4, + } + ); + assert_eq!( + dec.decode(44, b"\x8d\x04\x00\x94").unwrap(), + InstrInfo { + addr: 44, + is_call: true, + length: 4, 
+ } + ); + assert_eq!( + dec.decode(0, b"\x8d\x04\x00\x14").unwrap(), + InstrInfo { + addr: 0, + is_call: false, + length: 4, + } + ); + assert_eq!( + dec.decode(0x123444, b"\x1f\x20\x03\xd5").unwrap(), + InstrInfo { + addr: 0x123444, + is_call: false, + length: 4, + } + ); + } +} diff --git a/rust-crates/symblib/src/dwarf.rs b/rust-crates/symblib/src/dwarf.rs new file mode 100644 index 00000000..9ee6f6dc --- /dev/null +++ b/rust-crates/symblib/src/dwarf.rs @@ -0,0 +1,1416 @@ +// Copyright The OpenTelemetry Authors +// SPDX-License-Identifier: Apache-2.0 + +//! Abstraction for extracting information from object files with DWARF data. +//! +//! The main type here is [`Sections`], created via [`Sections::load`]. + +// Compiler complains about using the gimli constants in match patterns. +#![allow(non_upper_case_globals)] + +use crate::{debug, objfile, AnyError, VirtAddr}; +use fallible_iterator::FallibleIterator; +use gimli::{constants::*, AttributeValue as AV}; +use lru::LruCache; +use smallvec::{smallvec, SmallVec}; +use std::borrow::Cow; +use std::cell::RefCell; +use std::num::NonZeroU64; +use std::ops::Range; +use std::rc::Rc; +use std::{fmt, iter, mem, slice}; + +/// Shorthand for the [`gimli`] reader type that we use everywhere. +/// +/// Until BE binaries come back into favor we simply hard-code LE at +/// compile time, getting rid of a ton of unnecessary branching. +type R<'dwarf> = gimli::EndianSlice<'dwarf, gimli::LittleEndian>; + +/// Maximum number of compilation units to process per object file. +const MAX_COMP_UNITS: usize = 256 * 1024; + +/// Maximum depth of an inline function tree. +const MAX_INLINE_DEPTH: usize = 64 * 1024; + +/// Maximum size of the LRU cache for decoded units. +const UNIT_CACHE_SIZE: usize = 64; + +/// Result type shorthand. +pub type Result = std::result::Result; + +/// Errors that can occur during DWARF parsing. 
+#[non_exhaustive] +#[allow(missing_docs)] +#[derive(Debug, thiserror::Error)] +pub enum Error { + #[error("Reader currently doesn't support big endian binaries")] + BigEndian, + + #[error("DWARF references a supplementary file but none was provided")] + MissingSupplementaryInfo, + + #[error("Reference points to non-existing unit")] + BadUnitRef, + + #[error("Reference points to invalid offset within a unit")] + BadUnitOffset, + + #[error("Reference attribute has unexpected type")] + BadRefAttrType, + + #[error("Language attribute has unexpected type")] + BadLangAttrType, + + #[error("Found inline subroutine outside of an enclosing function")] + InlineSubroutineOutsideFunction, + + #[error("DIE reference chain too long")] + DieReferenceChainTooLong, + + #[error("Encountered an invalid line table reference")] + BadLineTableReference, + + #[error("The call-file attribute is not a numeric index")] + CallFileNotNumeric, + + #[error("The inline tree is too deep")] + InlineTreeTooDeep, + + #[error("The input file has too many translation units")] + UnitLimitExceeded, + + #[error("The supplementary debug file has another supplementary file (unsupported)")] + RecursiveSupplementaryFile, + + #[error("File contains an invalid file index value `{}`", .0)] + InvalidFileIndex(u64), + + #[error("File contains an invalid directory index value `{}`", .0)] + InvalidDirectoryIndex(u64), + + #[error("Line table doesn't increase monotonically")] + NonMonotonicLineTable, + + #[error("objfile error")] + Objfile(#[from] objfile::Error), + + #[error(transparent)] + Other(AnyError), +} + +/// Conversion of [`gimli`] errors into ours. +/// +/// We erase the type here to prevent leaking [`gimli`] library types into our +/// public interface. If code needs to special-case based on particular gimli +/// errors, we should instead lift them into custom error variants. 
+impl From for Error { + fn from(e: gimli::Error) -> Self { + Self::Other(Box::new(e)) + } +} + +/// Collection of DWARF sections of an object file. +/// +/// Implements lazy decoding of DWARF information from object files. This is +/// currently a higher-level abstraction over the `gimli` library. +pub struct Sections<'obj> { + main: gimli::DwarfSections>>, + sup: Option>>>, +} + +impl<'obj> Sections<'obj> { + /// Reads the DWARF sections from the given object file. + pub fn load(obj: &objfile::Reader<'obj>) -> Result { + if !obj.is_little_endian() { + return Err(Error::BigEndian); + } + + Ok(Self { + main: gimli::DwarfSections::load(|id| obj.load_section_reloc(id.name().as_bytes()))?, + sup: None, + }) + } + + /// Additionally load data from a supplementary object file. + pub fn load_sup(&mut self, sup: &objfile::Reader<'obj>) -> Result { + if !sup.is_little_endian() { + return Err(Error::BigEndian); + } + + self.sup = Some(gimli::DwarfSections::load(|id| { + sup.load_section_reloc(id.name().as_bytes()) + })?); + + Ok(()) + } + + /// Collect a list of all translation units in the DWARF sections. + pub fn units(&self) -> Result> { + // Create a borrowing DWARF instance from our owned one. + fn borrow<'a>(section: &'a Option>) -> R<'a> { + let data = match section { + Some(x) => x, + None => &[][..], + }; + + R::new(data, gimli::LittleEndian) + } + + let mut dwarf = self.main.borrow(borrow); + if let Some(sup) = &self.sup { + dwarf.set_sup(sup.borrow(borrow)); + } + + // Collect all units now. We later need this to quickly seek to + // different units when we encounter cross-unit references. + let main = collect_unit_headers(&dwarf)?; + + // Do the same for the supplementary file if present. 
+        let sup = match dwarf.sup() {
+            Some(sup) => collect_unit_headers(sup)?,
+            None => vec![],
+        };
+
+        let cache_size = UNIT_CACHE_SIZE
+            .try_into()
+            .expect("UNIT_CACHE_SIZE must be >0");
+
+        let unit_cache = RefCell::new(LruCache::new(cache_size));
+
+        Ok(Units {
+            dwarf,
+            main,
+            sup,
+            unit_cache,
+        })
+    }
+}
+
+/// Determines the location of a unit.
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
+enum UnitLocation {
+    /// Unit lives in the main DWARF file.
+    Main,
+
+    /// Unit lives in the supplementary DWARF file.
+    Sup,
+}
+
+/// Cached information about a unit.
+struct CachedUnitInfo<'dwarf> {
+    /// The gimli unit that this info was built from.
+    gimli_unit: gimli::Unit>,
+    /// DWARF file (main or supplementary) that the unit lives in.
+    loc: UnitLocation,
+    /// `DW_AT_producer` string of the unit's first DIE, if present.
+    producer: Option>,
+    /// `DW_AT_language` value of the unit's first DIE, if present.
+    language: Option,
+}
+
+impl<'dwarf> CachedUnitInfo<'dwarf> {
+    /// Builds the cached info for `gimli_unit`.
+    ///
+    /// Only the attributes of the first DIE in the unit are inspected, picking
+    /// up `DW_AT_producer` and `DW_AT_language`. Fails with
+    /// [`Error::BadLangAttrType`] if the language attribute does not hold an
+    /// [`AV::Language`] value.
+    fn from_gimli_unit(
+        loc: UnitLocation,
+        dwarf: &gimli::Dwarf>,
+        gimli_unit: gimli::Unit>,
+    ) -> Result {
+        let mut die_iter = gimli_unit.entries();
+
+        let mut producer = None;
+        let mut language = None;
+
+        if let Some((_, die)) = die_iter.next_dfs()? {
+            let mut attrs = die.attrs();
+            while let Some(attr) = attrs.next()? {
+                match attr.name() {
+                    DW_AT_producer => {
+                        producer = Some(dwarf.attr_string(&gimli_unit, attr.value())?);
+                    }
+                    DW_AT_language => {
+                        let AV::Language(lang) = attr.value() else {
+                            return Err(Error::BadLangAttrType);
+                        };
+
+                        language = Some(lang);
+                    }
+                    _ => {}
+                }
+            }
+        };
+
+        Ok(Self {
+            loc,
+            gimli_unit,
+            producer,
+            language,
+        })
+    }
+}
+
+/// List of all translation units in both the main and the supplementary DWARF file.
+///
+/// Units can contain references to each other and this object serves as an
+/// index that permits efficient lookups of other units for these cases.
+pub struct Units<'dwarf> {
+    /// Borrowed view into the DWARF sections held in the [`Sections`] object.
+    dwarf: gimli::Dwarf>,
+
+    /// List of all unit headers in the main DWARF file.
+    main: Vec>>,
+
+    /// List of all unit headers in the supplementary DWARF file.
+    ///
+    /// Empty if no supplementary file is present.
+    sup: Vec>>,
+
+    /// Cache of decoded unit information.
+    ///
+    /// This significantly reduces the need to constantly re-decode units
+    /// when resolving cross-unit references.
+    unit_cache: RefCell<
+        LruCache<
+            /* key: */ (UnitLocation, gimli::DebugInfoOffset),
+            /* value: */ Rc>,
+        >,
+    >,
+}
+
+impl<'dwarf> Units<'dwarf> {
+    /// Iterate over all units in the main DWARF file.
+    pub fn iter<'units>(&'units self) -> UnitIter<'dwarf, 'units> {
+        UnitIter {
+            all: self,
+            iter: self.main.iter(),
+        }
+    }
+
+    /// Locates the unit that contains the given offset into the `.debug_info` section.
+    ///
+    /// Returns `Ok(None)` if no unit header in the selected file covers `offset`.
+    fn unit_for_offset<'units>(
+        &'units self,
+        location: UnitLocation,
+        offset: gimli::DebugInfoOffset,
+    ) -> Result>> {
+        let headers = match location {
+            UnitLocation::Main => &self.main,
+            UnitLocation::Sup => &self.sup,
+        };
+
+        // Use binary search to locate the unit in question.
+        let header = match headers.binary_search_by_key(&offset, unit_start) {
+            // Exact match.
+            Ok(idx) => Some(&headers[idx]),
+
+            // Offset sorts before the first unit, or our unit array is empty.
+            Err(0) => None,
+
+            // Either found somewhere within a unit or outside of valid range.
+            Err(idx) => {
+                // `idx` is the insertion point, so the candidate is the unit
+                // starting immediately before the offset.
+                let matched = &headers[idx - 1];
+                if unit_range(matched).contains(&offset) {
+                    Some(matched)
+                } else {
+                    None
+                }
+            }
+        };
+
+        // Compare with the result of a dumb linear search when compiled in debug mode.
+        // Both variants must be equivalent in all cases.
+        debug_assert_eq!(
+            header.map(|x| x as *const _),
+            headers
+                .iter()
+                .find(|unit| unit_range(unit).contains(&offset))
+                .map(|x| x as *const _)
+        );
+
+        match header {
+            Some(header) => self.unit_for_header(location, header),
+            None => Ok(None),
+        }
+    }
+
+    /// Creates a new `Unit` object for the given unit header.
+    ///
+    /// Decoded unit information is looked up in `unit_cache` first, keyed by
+    /// the location and the start offset of the unit, and inserted into the
+    /// cache after decoding on a miss.
+    fn unit_for_header<'units>(
+        &'units self,
+        location: UnitLocation,
+        header: &'units gimli::UnitHeader>,
+    ) -> Result>> {
+        let mut cache = self.unit_cache.borrow_mut();
+        let cache_key = (location, unit_start(header));
+
+        // Fast path: if we have the decoded unit info cached, just return it.
+        if let Some(cached_info) = cache.get(&cache_key) {
+            return Ok(Some(Unit {
+                all: self,
+                unit: cached_info.clone(),
+            }));
+        }
+
+        // Slow path: decode unit info now and cache it for the next time.
+        let dwarf = match location {
+            UnitLocation::Main => &self.dwarf,
+            UnitLocation::Sup => self.dwarf.sup().ok_or(Error::MissingSupplementaryInfo)?,
+        };
+
+        let unit_info = Rc::new(CachedUnitInfo::from_gimli_unit(
+            location,
+            dwarf,
+            dwarf.unit(*header)?,
+        )?);
+
+        cache.put(cache_key, unit_info.clone());
+
+        Ok(Some(Unit {
+            all: self,
+            unit: unit_info,
+        }))
+    }
+}
+
+/// Iterator over the translation units in a DWARF file.
+///
+/// Created using [`Units::iter`]. Continuing iteration on errors is well-
+/// defined and guaranteed not to run into infinite loops: units with bad
+/// headers will simply be skipped.
+#[derive(Clone)]
+pub struct UnitIter<'dwarf, 'units> {
+    /// Unit index that the yielded units refer back to.
+    all: &'units Units<'dwarf>,
+    /// Iterator over the unit headers of the main DWARF file.
+    iter: slice::Iter<'units, gimli::UnitHeader>>,
+}
+
+impl<'dwarf, 'units> FallibleIterator for UnitIter<'dwarf, 'units> {
+    type Item = Unit<'dwarf, 'units>;
+    type Error = Error;
+
+    fn next(&mut self) -> Result> {
+        Ok(match self.iter.next() {
+            Some(header) => match self.all.unit_for_header(UnitLocation::Main, header) {
+                Ok(Some(unit)) => Some(unit),
+                // `unit_for_header` either yields a unit or fails.
+                Ok(None) => unreachable!(),
+                Err(e) => return Err(e),
+            },
+            None => None,
+        })
+    }
+
+    fn size_hint(&self) -> (usize, Option) {
+        self.iter.size_hint()
+    }
+}
+
+/// Programming language this unit was compiled from.
+///
+/// This currently only maps languages that we need special casing for, mapping
+/// all other languages to [`Self::Other`]. The DWARF language attribute also
+/// contains the language "version", e.g. C11, but we currently simplify this to
+/// just the language.
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
+pub enum Lang {
+    /// C.
+    C,
+
+    /// C++.
+    Cxx,
+
+    /// Go.
+    Go,
+
+    /// Rust.
+    Rust,
+
+    /// Language is known but currently not mapped in this enum type.
+    Other,
+}
+
+/// References a translation unit in a DWARF section.
+#[derive(Clone)]
+pub struct Unit<'dwarf, 'units> {
+    /// Unit index that this unit was obtained from.
+    all: &'units Units<'dwarf>,
+    /// Shared, cached information about the unit.
+    unit: Rc>,
+}
+
+impl fmt::Debug for Unit<'_, '_> {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        // We add the header length here to obtain the offset of the first DIE.
+        let hdr_len = self.unit.gimli_unit.header.size_of_header();
+        let offs = self.unit.gimli_unit.header.offset().as_debug_info_offset();
+        let offs = offs.expect("we don't inspect type sections").0 + hdr_len;
+
+        let name = self.name().unwrap_or(Cow::Borrowed(""));
+        let is_sup = self.unit.loc == UnitLocation::Sup;
+        let loc = if is_sup { "sup::" } else { "" };
+        write!(f, "Unit(\"{name}\" @ {loc}{offs:#08x})")
+    }
+}
+
+impl<'dwarf, 'units> Unit<'dwarf, 'units> {
+    /// Gets the correct DWARF object for the location of this unit.
+    fn dwarf(&self) -> &'units gimli::Dwarf> {
+        match self.unit.loc {
+            UnitLocation::Main => &self.all.dwarf,
+            UnitLocation::Sup => self.all.dwarf.sup().expect(
+                "bug: units with this location should not be constructed if there's no sup",
+            ),
+        }
+    }
+
+    /// Gets the name of the translation unit.
+    pub fn name(&self) -> Option> {
+        self.unit.gimli_unit.name.map(|x| x.to_string_lossy())
+    }
+
+    /// Gets the producer (compiler) that created this unit.
+    pub fn producer(&self) -> Option> {
+        self.unit.producer.map(|x| x.to_string_lossy())
+    }
+
+    /// Gets the programming language this unit was compiled from.
+    ///
+    /// Returns `None` if the unit has no language attribute.
+    pub fn language(&self) -> Option {
+        Some(match self.unit.language? {
+            DW_LANG_C | DW_LANG_C89 | DW_LANG_C99 | DW_LANG_C11 | DW_LANG_C17 => Lang::C,
+            DW_LANG_C_plus_plus
+            | DW_LANG_C_plus_plus_03
+            | DW_LANG_C_plus_plus_11
+            | DW_LANG_C_plus_plus_14
+            | DW_LANG_C_plus_plus_17
+            | DW_LANG_C_plus_plus_20 => Lang::Cxx,
+            DW_LANG_Rust => Lang::Rust,
+            DW_LANG_Go => Lang::Go,
+            _ => Lang::Other,
+        })
+    }
+
+    /// Iterate over the PC ranges of this unit.
+    pub fn ranges(&self) -> Result> {
+        Ok(RangeIter(self.dwarf().unit_ranges(&self.unit.gimli_unit)?))
+    }
+
+    /// Iterate over subprograms in this translation unit.
+    pub fn subprograms<'unit>(&'unit self) -> SubprogramIter<'dwarf, 'units, 'unit> {
+        SubprogramIter {
+            unit: self,
+            die_iter: self.unit.gimli_unit.entries(),
+            next_mode: NextItemMode::Any,
+        }
+    }
+
+    /// Construct an iterator over the line table.
+    ///
+    /// Returns `None` if the unit has no line program.
+    pub fn line_iter(&self) -> Option> {
+        let line_program = self.unit.gimli_unit.line_program.as_ref()?.clone();
+        Some(LineIter {
+            unit: self.clone(),
+            rows: line_program.rows(),
+            state: LineTableIterState::Void,
+        })
+    }
+
+    /// Resolves the given reference value.
+    ///
+    /// Currently supports the following [`AV`] types:
+    /// - [`AV::UnitRef`]
+    /// - [`AV::DebugInfoRef`]
+    /// - [`AV::DebugInfoRefSup`]
+    ///
+    /// On success, returns the unit that contains the referenced DIE together
+    /// with the offset of the DIE relative to that unit.
+    fn resolve_ref(
+        &self,
+        reference: AV>,
+    ) -> Result<(Unit<'dwarf, 'units>, gimli::UnitOffset)> {
+        use UnitLocation as UL;
+
+        // Determine file and offset from the attribute value type.
+        let (location, offs) = match (self.unit.loc, reference) {
+            // Reference within same CU. Simple case, do early exit.
+            (_, AV::UnitRef(offs)) => return Ok((self.clone(), offs)),
+
+            // Reference into another CU within this file.
+            (location, AV::DebugInfoRef(offs)) => (location, offs),
+
+            // Reference from the main DWARF into a CU in the supplementary file.
+            (UL::Main, AV::DebugInfoRefSup(offs)) => (UL::Sup, offs),
+
+            // Reference into the supplementary DWARF while already in the supplementary file.
+            (UL::Sup, AV::DebugInfoRefSup(_)) => return Err(Error::RecursiveSupplementaryFile),
+
+            // Any other attribute type is a violation of the specification.
+            _ => return Err(Error::BadRefAttrType),
+        };
+
+        let Some(refd_unit) = self.all.unit_for_offset(location, offs)? else {
+            return Err(Error::BadUnitRef);
+        };
+        // Convert the section-wide offset into one relative to the unit found.
+        let Some(offs) = offs.to_unit_offset(&refd_unit.unit.gimli_unit.header) else {
+            return Err(Error::BadUnitOffset);
+        };
+
+        Ok((refd_unit, offs))
+    }
+}
+
+/// Determines how the next item is selected.
+#[derive(Debug, Copy, Clone)]
+enum NextItemMode {
+    /// Selects whatever DIE is next (depth-first search).
+    Any,
+
+    /// Selects the next sibling.
+    SkipChildren,
+}
+
+/// Iterator over the subprograms in a [`Unit`].
+///
+/// Created via [`Unit::subprograms`].
+#[derive(Clone)]
+pub struct SubprogramIter<'dwarf, 'units, 'unit: 'units> {
+    /// Unit whose DIEs are being walked.
+    unit: &'unit Unit<'dwarf, 'units>,
+    /// Cursor over the DIEs of the unit.
+    die_iter: gimli::EntriesCursor<'unit, 'unit, R<'dwarf>>,
+    /// How the cursor is advanced on the next call to `next`.
+    next_mode: NextItemMode,
+}
+
+impl<'dwarf, 'units, 'unit: 'units> FallibleIterator for SubprogramIter<'dwarf, 'units, 'unit> {
+    type Item = Subprogram<'dwarf, 'units>;
+    type Error = Error;
+
+    fn next(&mut self) -> Result> {
+        // Note: this is not particularly efficient if the DWARF file doesn't
+        // have sibling links. We might want to give the `Subprogram` instances
+        // a link to this instance and have them send the offset that they
+        // ended their iteration at, but that's not exactly trivial to do while
+        // also not allocating anything (no `Arc`) and still
+        // implementing `FallibleIterator` (can't return refs to `self`).
+
+        loop {
+            let die = 'found_die: {
+                // Reset mode and skip children if we were asked to.
+                if let NextItemMode::SkipChildren =
+                    mem::replace(&mut self.next_mode, NextItemMode::Any)
+                {
+                    if let Some(sibling) = self.die_iter.next_sibling()? {
+                        break 'found_die sibling;
+                    }
+                    // If no sibling was found, continue normal DFS.
+                }
+
+                match self.die_iter.next_dfs()? {
+                    Some(x) => x.1,
+                    None => return Ok(None),
+                }
+            };
+
+            // Skip irrelevant records, but not their children: they might
+            // contain records that we do care about.
+            if !matches!(die.tag(), DW_TAG_subprogram | DW_TAG_entry_point) {
+                continue;
+            }
+
+            // For the record types selected above, skip child nodes when this
+            // iterator is woken up next time: they are either abstract or dealt
+            // with by the `Subprogram` object that we yield here.
+            self.next_mode = NextItemMode::SkipChildren;
+
+            // Skip over abstract records (and their children).
+            if die_is_abstract(die)? {
+                continue;
+            }
+
+            // Still here? We have a relevant record that we want to yield.
+            return Ok(Some(Subprogram {
+                unit: self.unit.clone(),
+                info: SubprogramInfo::from_die(0, self.unit.clone(), die)?,
+                die_iter: self.die_iter.clone(),
+            }));
+        }
+    }
+}
+
+/// Describes a top-level (non-inline) subprogram in the application.
+pub struct Subprogram<'dwarf, 'units> {
+    /// Unit that this subprogram was found in.
+    unit: Unit<'dwarf, 'units>,
+    /// Information extracted from the subprogram's DIE.
+    info: SubprogramInfo<'dwarf, 'units>,
+    /// Copy of the DIE cursor, taken at the moment this subprogram was yielded.
+    die_iter: gimli::EntriesCursor<'units, 'units, R<'dwarf>>,
+}
+
+impl<'dwarf, 'units> Subprogram<'dwarf, 'units> {
+    /// Destructively extracts the [`SubprogramInfo`].
+    pub fn into_info(self) -> SubprogramInfo<'dwarf, 'units> {
+        self.info
+    }
+
+    /// Destructively iterate over both this subroutine and all inline instances.
+    ///
+    /// The subroutine itself is yielded first, followed by its inline instances.
+    ///
+    /// TODO: impl IntoFallibleIterator instead?
+    pub fn into_iter(
+        self,
+    ) -> impl FallibleIterator, Error = Error> {
+        let inline_iter = self.inline_instances();
+        let self_iter = iter::once(Ok(self.into_info()));
+        let self_iter = fallible_iterator::convert(self_iter);
+        self_iter.chain(inline_iter)
+    }
+
+    /// Iterate over functions that have been inlined into this subroutine.
+    pub fn inline_instances(&self) -> InlineInstanceIter<'dwarf, 'units> {
+        InlineInstanceIter {
+            unit: self.unit.clone(),
+            die_iter: self.die_iter.clone(),
+            // The subprogram itself is the root of the tag stack and accounts
+            // for the initial function tree depth of 1.
+            tag_stack: smallvec![DW_TAG_subprogram],
+            fn_tree_depth: 1,
+        }
+    }
+}
+
+/// Iterator over the inline instances in a [`Subprogram`].
+///
+/// Created via [`Subprogram::inline_instances`].
+pub struct InlineInstanceIter<'dwarf, 'units> {
+    /// Unit that the subprogram lives in.
+    unit: Unit<'dwarf, 'units>,
+    /// Cursor over the DIEs, starting at the subprogram's DIE.
+    die_iter: gimli::EntriesCursor<'units, 'units, R<'dwarf>>,
+    /// Tags of the DIEs visited so far that have not been popped again;
+    /// the bottom entry is the subprogram itself.
+    tag_stack: SmallVec<[DwTag; 64]>,
+    /// Number of `tag_stack` entries that are subprograms or inlined subroutines.
+    fn_tree_depth: u64,
+}
+
+impl<'dwarf, 'units> FallibleIterator for InlineInstanceIter<'dwarf, 'units> {
+    type Item = SubprogramInfo<'dwarf, 'units>;
+    type Error = Error;
+
+    fn next(&mut self) -> Result> {
+        /// Whether a DIE with this tag counts as a level in the function tree.
+        fn tag_affects_depth(x: DwTag) -> bool {
+            matches!(x, DW_TAG_subprogram | DW_TAG_inlined_subroutine)
+        }
+
+        loop {
+            let Some((depth_delta, die)) = self.die_iter.next_dfs()? else {
+                return Ok(None);
+            };
+
+            // Remove as many levels as we have left behind, plus one since we
+            // always push the current element even if it doesn't have children.
+            self.fn_tree_depth -= (0..1 - depth_delta)
+                .flat_map(|_| self.tag_stack.pop())
+                .filter(|&x| tag_affects_depth(x))
+                .count() as u64;
+
+            // The initial `DW_TAG_subprogram` entry was popped: we moved past
+            // the subprogram that this iterator was created for.
+            if self.tag_stack.is_empty() {
+                break Ok(None);
+            }
+
+            if self.tag_stack.len() + 1 > MAX_INLINE_DEPTH {
+                return Err(Error::InlineTreeTooDeep);
+            }
+
+            self.tag_stack.push(die.tag());
+
+            if !tag_affects_depth(die.tag()) {
+                continue;
+            }
+
+            self.fn_tree_depth += 1;
+
+            // Skip abstract DIEs -- they are instead caught via references
+            // in concrete instances and have relative address ranges that
+            // only make sense in that concrete context.
+            if die_is_abstract(die)? {
+                continue;
+            }
+
+            // The reported depth does not count the DIE itself.
+            break Ok(Some(SubprogramInfo::from_die(
+                self.fn_tree_depth - 1,
+                self.unit.clone(),
+                die,
+            )?));
+        }
+    }
+}
+
+/// Common information for both top-level subroutines and inline instances.
+pub struct SubprogramInfo<'dwarf, 'units> {
+    /// Depth of this function in the inline tree, see [`Self::depth`].
+    fn_tree_depth: u64,
+    /// `DW_AT_name` attribute value and the unit it was read from.
+    name: Option>,
+    /// `DW_AT_linkage_name` attribute value and the unit it was read from.
+    link_name: Option>,
+    /// `DW_AT_call_file` attribute value and the unit it was read from.
+    call_file: Option>,
+    /// `DW_AT_call_line` value; `None` if the attribute is absent, cannot be
+    /// read as an unsigned integer, or is zero.
+    call_line: Option,
+    /// PC ranges of the DIE; emptied by [`Self::take_ranges`].
+    die_ranges: Option>>,
+}
+
+impl fmt::Debug for SubprogramInfo<'_, '_> {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        write!(f, "SubprogramInfo(name = {:?})", self.name())
+    }
+}
+
+impl<'dwarf, 'units> SubprogramInfo<'dwarf, 'units> {
+    /// Returns the depth of this function in the inline tree.
+    ///
+    /// The outermost, top-level function has a depth of `0`.
+    pub fn depth(&self) -> u64 {
+        self.fn_tree_depth
+    }
+
+    /// Merge this instance with another, preferring entries from `self`.
+    ///
+    /// Only the name, the linkage name and the DIE ranges are merged; the
+    /// depth and the call site information of `self` are left untouched.
+    fn merge_from(&mut self, other: Self) {
+        if self.name.is_none() {
+            self.name = other.name;
+        }
+        if self.link_name.is_none() {
+            self.link_name = other.link_name;
+        }
+        if self.die_ranges.is_none() {
+            self.die_ranges = other.die_ranges;
+        }
+    }
+
+    /// Extract required information from a DIE.
+    fn from_die(
+        fn_tree_depth: u64,
+        unit: Unit<'dwarf, 'units>,
+        die: &gimli::DebuggingInformationEntry<'_, '_, R<'dwarf>>,
+    ) -> Result {
+        Self::from_die_impl(fn_tree_depth, unit, die, 0)
+    }
+
+    /// Implementation of [`Self::from_die`] that additionally tracks how many
+    /// abstract origin / specification references have been followed so far.
+    ///
+    /// Fails with [`Error::DieReferenceChainTooLong`] once `recursion_depth`
+    /// exceeds 4.
+    fn from_die_impl(
+        fn_tree_depth: u64,
+        unit: Unit<'dwarf, 'units>,
+        die: &gimli::DebuggingInformationEntry<'_, '_, R<'dwarf>>,
+        recursion_depth: usize,
+    ) -> Result {
+        // Protect against theoretically-possible infinite reference loops (from abstract origins & specifications).
+        // recursion_depth > 2 is very rare. > 3 is yet to be seen. Using > 4 for good measure.
+        if recursion_depth > 4 {
+            return Err(Error::DieReferenceChainTooLong);
+        }
+
+        // Iterate the attributes and pick what we need. This is faster than
+        // calling `attr_value` for each attribute since this would internally
+        // loop over all attributes for each call.
+        let mut name = None;
+        let mut link_name = None;
+        let mut abstract_origin = None;
+        let mut spec = None;
+        let mut call_line = None;
+        let mut call_file = None;
+        let mut attrs = die.attrs();
+        while let Some(attr) = attrs.next()? {
+            match attr.name() {
+                // Reading is expensive: save unit + attribute value and decode lazily.
+                DW_AT_name => name = Some(UnitAV(unit.clone(), attr.value())),
+                DW_AT_linkage_name => link_name = Some(UnitAV(unit.clone(), attr.value())),
+                DW_AT_call_file => call_file = Some(UnitAV(unit.clone(), attr.value())),
+
+                // Reading is cheap: decode immediately.
+                DW_AT_call_line => call_line = attr.value().udata_value(),
+                DW_AT_abstract_origin => abstract_origin = Some(attr.value()),
+                DW_AT_specification => spec = Some(attr.value()),
+
+                // Ignore all other attribute types.
+                _ => (),
+            }
+        }
+
+        let mut info = SubprogramInfo {
+            fn_tree_depth,
+            name,
+            link_name,
+            call_file,
+            // A line number of zero is treated the same as a missing one.
+            call_line: call_line.and_then(NonZeroU64::new),
+            die_ranges: Some(unit.dwarf().die_ranges(&unit.unit.gimli_unit, die)?),
+        };
+
+        // If an abstract origin or a specification are present, also recurse into these.
+        // `merge_from` prefers properties from `self`, making sure that we use the most
+        // concrete information for our current DIE.
+        for ref_attr in abstract_origin.into_iter().chain(spec) {
+            let (refd_unit, refd_offs) = unit.resolve_ref(ref_attr)?;
+            let refd_die = refd_unit.unit.gimli_unit.entry(refd_offs)?;
+            info.merge_from(Self::from_die_impl(
+                fn_tree_depth,
+                refd_unit.clone(),
+                &refd_die,
+                recursion_depth + 1,
+            )?);
+        }
+
+        Ok(info)
+    }
+
+    /// Determine the name of this function.
+    ///
+    /// Returns `Ok(None)` if neither a linkage name nor a regular name is known.
+    pub fn name(&self) -> Result>> {
+        // Prefer the linkage name if it is present.
+        if let Some(UnitAV(ref unit, av)) = self.link_name {
+            let x = unit.dwarf().attr_string(&unit.unit.gimli_unit, av)?;
+            return Ok(Some(x.to_string_lossy()));
+        };
+
+        // Fallback to regular name.
+        if let Some(UnitAV(ref unit, av)) = self.name {
+            // TODO: must merge with containing namespaces and modules
+            let x = unit.dwarf().attr_string(&unit.unit.gimli_unit, av)?;
+            let x = x.to_string_lossy();
+            return Ok(Some(x));
+        }
+
+        Ok(None)
+    }
+
+    /// Destructively retrieve the DIE ranges for this routine.
+    ///
+    /// This consumes the range iterator on the first call, causing the next
+    /// [`Self::take_ranges`] call to return [`None`]. This is a quirk that is
+    /// required to work around gimli's DIE range iterator not implementing
+    /// [`Clone`].
+    pub fn take_ranges(&mut self) -> Option> {
+        self.die_ranges.take().map(RangeIter)
+    }
+
+    /// Reads the call file for this function, if present.
+    ///
+    /// Fails if the unit of the attribute has no line program or if the
+    /// attribute value is not a file index.
+    pub fn call_file(&self) -> Result>> {
+        let Some(UnitAV(ref unit, av)) = self.call_file else {
+            return Ok(None);
+        };
+        let Some(ref line_program) = unit.unit.gimli_unit.line_program else {
+            return Err(Error::BadLineTableReference);
+        };
+        let AV::FileIndex(file_idx) = av else {
+            return Err(Error::CallFileNotNumeric);
+        };
+
+        Ok(Some(SourceFile::read_from_linetab(
+            unit.clone(),
+            line_program.header(),
+            SourceFileId(file_idx),
+        )?))
+    }
+
+    /// Reads the call line for this function, if present.
+    pub fn call_line(&self) -> Option {
+        self.call_line
+    }
+}
+
+/// Iterator yielding the PC ranges of a subroutine or inline instance.
+///
+/// Thin wrapper around the corresponding gimli type to prevent leaking gimli
+/// types into the public interface of this module.
+pub struct RangeIter<'dwarf>(gimli::RangeIter>);
+
+impl<'dwarf> FallibleIterator for RangeIter<'dwarf> {
+    type Item = Range;
+    type Error = Error;
+
+    fn next(&mut self) -> Result> {
+        // Convert gimli's range type into a plain `begin..end` range.
+        Ok(self.0.next()?.map(|x| x.begin..x.end))
+    }
+}
+
+/// Opaque ID that uniquely identifies a file within a unit.
+///
+/// TODO: should probably include unit offset to ensure global uniqueness?
+#[derive(Debug, Eq, PartialEq, Ord, PartialOrd, Clone, Copy)]
+pub struct SourceFileId(u64);
+
+/// File in the DWARF line table.
+#[derive(Debug, Clone, Eq, PartialEq)]
+pub struct SourceFile<'dwarf> {
+    /// Unique ID within a unit.
+    pub id: SourceFileId,
+    /// Directory component of the source path, if known.
+    pub dir: Option>,
+    /// File name component of the source path.
+    pub name: Cow<'dwarf, str>,
+}
+
+/// Writes the directory and the file name joined by a `/`.
+///
+/// An unknown directory is written as the empty string.
+impl<'dwarf> fmt::Display for SourceFile<'dwarf> {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        let dir = self.dir.as_deref().unwrap_or("");
+        write!(f, "{}/{}", dir, self.name)
+    }
+}
+
+impl<'dwarf> SourceFile<'dwarf> {
+    /// Reads the file entry `id` from the given line program header.
+    ///
+    /// Fails with [`Error::InvalidFileIndex`] if the header has no such file
+    /// and with [`Error::InvalidDirectoryIndex`] if the entry refers to a
+    /// directory that the header does not have (directory index `0` is
+    /// tolerated and yields a file without directory).
+    fn read_from_linetab<'units>(
+        unit: Unit<'dwarf, 'units>,
+        header: &gimli::LineProgramHeader>,
+        id: SourceFileId,
+    ) -> Result {
+        let Some(file_entry) = header.file(id.0) else {
+            return Err(Error::InvalidFileIndex(id.0));
+        };
+
+        let name_av = file_entry.path_name();
+        let name_slice = unit.dwarf().attr_string(&unit.unit.gimli_unit, name_av)?;
+        let name = name_slice.to_string_lossy();
+
+        let Some(dir_av) = file_entry.directory(header) else {
+            // `0` refers to the `DW_AT_comp_dir` attribute of the CU: if we
+            // ended up here, this means that the CU does not have the comp_dir
+            // attribute. I don't think that the DWARF spec permits that, but
+            // we've seen it in mainstream executables, so we allow it anyway.
+            if file_entry.directory_index() == 0 {
+                return Ok(Self {
+                    id,
+                    dir: None,
+                    name,
+                });
+            }
+
+            return Err(Error::InvalidDirectoryIndex(file_entry.directory_index()));
+        };
+
+        let dir_slice = unit.dwarf().attr_string(&unit.unit.gimli_unit, dir_av)?;
+        let dir = Some(dir_slice.to_string_lossy());
+
+        Ok(Self { id, dir, name })
+    }
+}
+
+/// Associates a PC range with a source file and line number.
+#[derive(Debug, Clone, Eq, PartialEq)]
+pub struct LineTableEntry<'dwarf> {
+    /// PC range being described by this line table entry.
+    pub rng: Range,
+    /// Source file that corresponds to this range.
+    pub file: SourceFile<'dwarf>,
+    /// Line number within the source file, starting at `1`.
+    pub line: Option,
+}
+
+/// Internal state of [`LineIter`].
+#[derive(Debug, Clone, Eq, PartialEq)]
+enum LineTableIterState<'dwarf> {
+    /// We are in the void between ranges.
+    Void,
+    /// We are within a line table range.
+    InRange(LineTableEntry<'dwarf>),
+}
+
+impl<'dwarf> LineTableIterState<'dwarf> {
+    /// Constructs a [`Self::InRange`] variant of this enum from a gimli row.
+    ///
+    /// The entry starts out with an empty range at the address of the row;
+    /// the end of the range is filled in by [`LineIter`] as further rows
+    /// are processed.
+    pub fn from_row<'units>(
+        unit: Unit<'dwarf, 'units>,
+        header: &gimli::LineProgramHeader>,
+        row: &gimli::LineRow,
+    ) -> Result> {
+        Ok(Self::InRange(LineTableEntry {
+            file: SourceFile::read_from_linetab(unit, header, SourceFileId(row.file_index()))?,
+            rng: row.address()..row.address(),
+            line: row.line(),
+        }))
+    }
+
+    /// Consume this instance, extracting the current entry.
+    ///
+    /// # Panics
+    ///
+    /// If currently in [`Self::Void`] state.
+    pub fn unwrap_entry(self) -> LineTableEntry<'dwarf> {
+        match self {
+            Self::Void => panic!("attempted unwrapping void state as range"),
+            Self::InRange(entry) => entry,
+        }
+    }
+}
+
+/// Iterator yielding all line table entries in a unit.
+///
+/// Constructed via [`Unit::line_iter`].
+pub struct LineIter<'dwarf, 'units> {
+    /// Unit whose line table is being iterated.
+    unit: Unit<'dwarf, 'units>,
+    /// Row iterator over the line program of the unit.
+    rows: gimli::LineRows, gimli::IncompleteLineProgram>>,
+    /// Entry that is currently being accumulated, if any.
+    state: LineTableIterState<'dwarf>,
+}
+
+impl<'dwarf, 'units> FallibleIterator for LineIter<'dwarf, 'units> {
+    type Item = LineTableEntry<'dwarf>;
+    type Error = Error;
+
+    /// Yields the next line table entry.
+    ///
+    /// An entry is only yielded once a later row tells us where it ends:
+    /// either the file or line changes, the sequence ends, the addresses
+    /// jump backwards or the line table is exhausted. Rows that change
+    /// neither file nor line merely extend the current entry.
+    fn next(&mut self) -> Result> {
+        use LineTableIterState::*;
+
+        loop {
+            let Some((header, row)) = self.rows.next_row()? else {
+                // Line table exhausted: yield final record if we still have one stashed.
+                return Ok(match mem::replace(&mut self.state, Void) {
+                    Void => None,
+                    InRange(entry) => Some(entry),
+                });
+            };
+
+            let active = match (&mut self.state, row.end_sequence()) {
+                // Sequence ends but we didn't even know that we are in one.
+                (Void, true) => continue,
+
+                // New sequence starts here: update state but don't yield anything.
+                (Void, false) => {
+                    self.state = LineTableIterState::from_row(self.unit.clone(), header, row)?;
+                    continue;
+                }
+
+                // Sequence is ending and we're moving into the void.
+                (state @ InRange { .. }, true) => {
+                    let mut old_state = mem::replace(state, Void).unwrap_entry();
+                    old_state.rng.end = row.address();
+                    return Ok(Some(old_state));
+                }
+
+                // Sequence is ongoing: handle outside this match.
+                (InRange(entry), false) => entry,
+            };
+
+            // DWARF5 [6.2.5]:
+            // > Within a sequence, addresses and operation pointers may only increase.
+            //
+            // While this is clearly not permitted per specification, it is unfortunately
+            // quite common in practice, so we have to handle it as gracefully as possible.
+            if active.rng.end > row.address() {
+                debug!(
+                    "Non-monotonic line table sequence (jumping from {:#08x} -> {:#08x})",
+                    active.rng.end,
+                    row.address()
+                );
+
+                let new = LineTableIterState::from_row(self.unit.clone(), header, row)?;
+                let mut old = mem::replace(&mut self.state, new).unwrap_entry();
+
+                // Since we have no idea where this would actually end we just
+                // arbitrarily assume it to be 1 byte long.
+                old.rng.end = old.rng.start + 1;
+
+                return Ok(Some(old));
+            }
+
+            // Extend range.
+            active.rng.end = row.address();
+
+            // Neither line number nor the file changed: done here.
+            if active.file.id == SourceFileId(row.file_index()) && active.line == row.line() {
+                continue;
+            }
+
+            // Sequence is ongoing and something changed: create new record.
+            let new_state = LineTableIterState::from_row(self.unit.clone(), header, row)?;
+            debug_assert_ne!(&new_state, &self.state);
+            let prev_state = mem::replace(&mut self.state, new_state);
+            return Ok(Some(prev_state.unwrap_entry()));
+        }
+    }
+}
+
+/// Pair of an attribute value and the corresponding unit.
+struct UnitAV<'dwarf, 'units>(Unit<'dwarf, 'units>, AV>);
+
+/// Unwraps the start offset of a unit as a [`gimli::DebugInfoOffset`].
+///
+/// # Panics
+///
+/// If the header's offset is not an offset into `.debug_info`.
+fn unit_start(unit: &gimli::UnitHeader>) -> gimli::DebugInfoOffset {
+    unit.offset()
+        .as_debug_info_offset()
+        .expect("we only collect non-type units")
+}
+
+/// Constructs the offset [`Range`] for a unit.
+///
+/// The range is half-open: it starts at the unit's start offset and ends
+/// `length_including_self()` bytes after it.
+fn unit_range(unit: &gimli::UnitHeader>) -> Range {
+    let start = unit_start(unit);
+    let end = gimli::DebugInfoOffset(start.0 + unit.length_including_self());
+    start..end
+}
+
+/// Inspect the given DIE and determine whether it is an abstract record
+/// that doesn't actually describe a location in the executable by itself.
+fn die_is_abstract(die: &gimli::DebuggingInformationEntry<'_, '_, R<'_>>) -> Result {
+    let mut attrs = die.attrs();
+    while let Some(attr) = attrs.next()? {
+        match attr.name() {
+            // DWARF 5 [3.3.8.1]:
+            // > Any subroutine entry that contains a DW_AT_inline attribute
+            // > whose value is other than DW_INL_not_inlined is known as an
+            // > abstract instance root.
+            DW_AT_inline => match attr.value() {
+                AV::Inline(DW_INL_not_inlined) => (),
+                AV::Inline(_) => return Ok(true),
+                _ => (),
+            },
+
+            // DWARF 5 [2.13.1]:
+            // > A debugging information entry that represents a non-defining or
+            // > otherwise incomplete declaration of a program entity has a
+            // > DW_AT_declaration attribute, which is a flag.
+            DW_AT_declaration => {
+                if let AV::Flag(true) = attr.value() {
+                    return Ok(true);
+                }
+            }
+
+            _ => (),
+        }
+    }
+
+    Ok(false)
+}
+
+/// Collect list of all unit headers in a DWARF file.
+///
+/// Fails with [`Error::UnitLimitExceeded`] if the file contains more than
+/// `MAX_COMP_UNITS` units.
+fn collect_unit_headers<'obj>(
+    dwarf: &gimli::Dwarf>,
+) -> Result>>> {
+    let mut unit_iter = dwarf.units().enumerate();
+    let mut units = Vec::with_capacity(unit_iter.size_hint().0);
+
+    while let Some((i, unit)) = unit_iter.next()? {
+        if i >= MAX_COMP_UNITS {
+            return Err(Error::UnitLimitExceeded);
+        }
+
+        units.push(unit);
+    }
+
+    Ok(units)
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::{objfile, tests::testdata};
+
+    /// Walks the units, line table, subprograms and inline instances of the
+    /// `inline` test executable and compares them with the dumps below.
+    #[test]
+    fn inline() {
+        let obj = objfile::File::load(&testdata("inline")).unwrap();
+        let obj = obj.parse().unwrap();
+
+        let dwarf = Sections::load(&obj).unwrap();
+        let units = dwarf.units().unwrap();
+
+        let mut unit_iter = units.iter();
+        let inline_c = unit_iter.next().unwrap().unwrap();
+        assert!(unit_iter.next().unwrap().is_none());
+
+        assert_eq!(inline_c.name().unwrap(), "inline.c");
+        assert_eq!(
+            inline_c.producer().unwrap(),
+            "GNU C17 12.2.0 -mlittle-endian -mabi=lp64 -g -O2 -fasynchronous-unwind-tables"
+        );
+        assert_eq!(inline_c.language().unwrap(), Lang::C);
+
+        // Output `llvm-dwarfdump --debug-line`
+        // ====================================
+        //
+        // include_directories[  0] = "/media/share/Development/prodfiler/libpf-rs/testdata"
+        // file_names[  0]:
+        //            name: "inline.c"
+        //       dir_index: 0
+        // file_names[  1]:
+        //            name: "inline.c"
+        //       dir_index: 0
+        // file_names[  2]:
+        //            name: ""
+        //       dir_index: 0
+        //
+        // Address            Line   Column File   ISA Discriminator Flags
+        // ------------------ ------ ------ ------ --- ------------- -------------
+        // 0x00000000000007a0      6     18      1   0             0  is_stmt
+        // 0x00000000000007a0      7      3      1   0             0  is_stmt
+        // 0x00000000000007b0     10     18      1   0             0  is_stmt
+        // 0x00000000000007b0     11      3      1   0             0  is_stmt
+        // 0x00000000000007b4     14     18      1   0             0  is_stmt
+        // 0x00000000000007b4     15      3      1   0             0  is_stmt
+        // 0x00000000000007c0     18     18      1   0             0  is_stmt
+        // 0x00000000000007c0     19      3      1   0             0  is_stmt
+        // 0x00000000000007c4     19      3      1   0             0  is_stmt end_sequence
+        // 0x0000000000000640     38     21      1   0             0  is_stmt
+        // 0x0000000000000640     39      3      1   0             0  is_stmt
+        // 0x0000000000000640     38     21      1   0             0
+        // 0x0000000000000648     39      3      1   0             0
+        // 0x000000000000064c     40      3      1   0             0  is_stmt
+        // 0x000000000000064c     34     12      1   0             0  is_stmt
+        // 0x000000000000064c     35      3      1   0             0  is_stmt
+        // 0x000000000000064c     30     12      1   0             0  is_stmt
+        // 0x000000000000064c     31      3      1   0             0  is_stmt
+        // 0x000000000000064c     26     12      1   0             0  is_stmt
+        // 0x000000000000064c     27      3      1   0             0  is_stmt
+        // 0x000000000000064c     22     12      1   0             0  is_stmt
+        // 0x000000000000064c     23      3      1   0             0  is_stmt
+        // 0x0000000000000658     41      1      1   0             0
+        // 0x0000000000000664     41      1      1   0             0  end_sequence
+
+        let actual_items: Vec<_> = inline_c.line_iter().unwrap().collect().unwrap();
+
+        // One `(PC range, line)` pair per entry that the iterator is expected
+        // to yield for the rows above.
+        let expected_items = [
+            (0x7a0..0x7a0, 6),
+            (0x7a0..0x7b0, 7),
+            (0x7b0..0x7b0, 10),
+            (0x7b0..0x7b4, 11),
+            (0x7b4..0x7b4, 14),
+            (0x7b4..0x7c0, 15),
+            (0x7c0..0x7c0, 18),
+            (0x7c0..0x7c4, 19),
+            // end_sequence
+            (0x640..0x640, 38),
+            (0x640..0x640, 39),
+            (0x640..0x648, 38),
+            (0x648..0x64c, 39),
+            (0x64c..0x64c, 40),
+            (0x64c..0x64c, 34),
+            (0x64c..0x64c, 35),
+            (0x64c..0x64c, 30),
+            (0x64c..0x64c, 31),
+            (0x64c..0x64c, 26),
+            (0x64c..0x64c, 27),
+            (0x64c..0x64c, 22),
+            (0x64c..0x658, 23),
+            (0x658..0x664, 41),
+            // end_sequence
+        ];
+
+        assert_eq!(actual_items.len(), expected_items.len());
+
+        let inline_c_path = "/media/share/Development/prodfiler/libpf-rs/testdata/inline.c";
+        for (actual, expected) in iter::zip(actual_items, expected_items) {
+            assert_eq!(actual.rng, expected.0, "range mismatch");
+            assert_eq!(actual.line, NonZeroU64::new(expected.1), "line mismatch");
+            assert_eq!(actual.file.id, SourceFileId(1), "file ID mismatch");
+            assert_eq!(actual.file.to_string(), inline_c_path, "file path mismatch");
+        }
+
+        // Output `llvm-dwarfdump --debug-info`
+        // ====================================
+        //
+        // NOTE: output manually filtered and re-indented to aid readability
+        //
+        // 0x000c: DW_TAG_compile_unit
+        //           DW_AT_name      ("inline.c")
+        //           DW_AT_low_pc    (0x0000000000000000)
+        //           DW_AT_ranges    (0x0000000c
+        //              [0x00000000000007a0, 0x00000000000007c4)
+        //              [0x0000000000000640, 0x0000000000000664))
+        //   0x0069: DW_TAG_subprogram
+        //             DW_AT_name    ("main")
+        //             DW_AT_low_pc  (0x0000000000000640)
+        //             DW_AT_high_pc (0x0000000000000664)
+        //     0x008b: DW_TAG_inlined_subroutine
+        //               DW_AT_abstract_origin (0x0000013a "a_inline")
+        //               DW_AT_low_pc          (0x000000000000064c)
+        //               DW_AT_high_pc         (0x0000000000000658)
+        //               DW_AT_call_file       ("[...]/testdata/inline.c")
+        //               DW_AT_call_line       (40)
+        //       0x00b0: DW_TAG_inlined_subroutine
+        //                 DW_AT_abstract_origin (0x00000144 "b_inline")
+        //                 DW_AT_low_pc          (0x000000000000064c)
+        //                 DW_AT_high_pc         (0x0000000000000658)
+        //                 DW_AT_call_file       ("[...]/testdata/inline.c")
+        //                 DW_AT_call_line       (35)
+        //         0x00cf: DW_TAG_inlined_subroutine
+        //                   DW_AT_abstract_origin (0x0000014e "c_inline")
+        //                   DW_AT_low_pc          (0x000000000000064c)
+        //                   DW_AT_high_pc         (0x0000000000000658)
+        //                   DW_AT_call_file       ("[...]/testdata/inline.c")
+        //                   DW_AT_call_line       (31)
+        //           0x00ee: DW_TAG_inlined_subroutine
+        //                     DW_AT_abstract_origin (0x00000158 "d_inline")
+        //                     DW_AT_low_pc          (0x000000000000064c)
+        //                     DW_AT_high_pc         (0x0000000000000658)
+        //                     DW_AT_call_file       ("[...]/testdata/inline.c")
+        //                     DW_AT_call_line       (27)
+        //   0x013a: DW_TAG_subprogram
+        //             DW_AT_name   ("a_inline")
+        //             DW_AT_inline (DW_INL_declared_inlined)
+        //   0x0144: DW_TAG_subprogram
+        //             DW_AT_name   ("b_inline")
+        //             DW_AT_inline (DW_INL_declared_inlined)
+        //   0x014e: DW_TAG_subprogram
+        //             DW_AT_name   ("c_inline")
+        //             DW_AT_inline (DW_INL_declared_inlined)
+        //   0x0158: DW_TAG_subprogram
+        //             DW_AT_name   ("d_inline")
+        //             DW_AT_inline (DW_INL_declared_inlined)
+        //   0x0162: DW_TAG_subprogram
+        //             DW_AT_name    ("a")
+        //             DW_AT_low_pc  (0x00000000000007c0)
+        //             DW_AT_high_pc (0x00000000000007c4)
+        //   0x018e: DW_TAG_subprogram
+        //             DW_AT_name      ("b")
+        //             DW_AT_decl_file ("[...]/testdata/inline.c")
+        //             DW_AT_low_pc    (0x00000000000007b4)
+        //             DW_AT_high_pc   (0x00000000000007b8)
+        //   0x01ba: DW_TAG_subprogram
+        //             DW_AT_name      ("c")
+        //             DW_AT_decl_file ("[...]/testdata/inline.c")
+        //             DW_AT_low_pc    (0x00000000000007b0)
+        //             DW_AT_high_pc   (0x00000000000007b4)
+        //   0x01e6: DW_TAG_subprogram
+        //             DW_AT_name      ("d")
+        //             DW_AT_decl_file ("[...]/testdata/inline.c")
+        //             DW_AT_low_pc    (0x00000000000007a0)
+        //             DW_AT_high_pc   (0x00000000000007ac)
+        //   0x0220: DW_TAG_subprogram
+        //             DW_AT_name        ("__builtin_puts")
+        //             DW_AT_declaration (true)
+
+        assert_eq!(inline_c.name().unwrap(), "inline.c");
+
+        let unit_ranges: Vec<_> = inline_c.ranges().unwrap().collect().unwrap();
+        assert_eq!(unit_ranges, [0x7a0..0x7c4, 0x640..0x664]);
+
+        let mut sp_iter = inline_c.subprograms();
+
+        // 0x0069
+        let mut main = sp_iter.next().unwrap().unwrap();
+        assert_eq!(main.info.depth(), 0);
+        assert_eq!(main.info.name().unwrap().unwrap(), "main");
+        assert!(main.info.call_line().is_none());
+        assert!(main.info.call_file().unwrap().is_none());
+        let rng: Vec<_> = main.info.take_ranges().unwrap().collect().unwrap();
+        assert_eq!(rng, [0x640..0x664]);
+
+        // 0x008b
+        let mut main_ii = main.inline_instances();
+        let mut a_inline = main_ii.next().unwrap().unwrap();
+        assert_eq!(a_inline.depth(), 1);
+        assert_eq!(a_inline.name().unwrap().unwrap(), "a_inline");
+        assert_eq!(a_inline.call_line().unwrap().get(), 40);
+        assert_eq!(a_inline.call_file().unwrap().unwrap().id, SourceFileId(1));
+        assert_eq!(
+            a_inline.call_file().unwrap().unwrap().to_string(),
+            inline_c_path
+        );
+        let rng: Vec<_> = a_inline.take_ranges().unwrap().collect().unwrap();
+        assert_eq!(rng, [0x64c..0x658]);
+
+        // 0x00b0
+        let b_inline = main_ii.next().unwrap().unwrap();
+        assert_eq!(b_inline.name().unwrap().unwrap(), "b_inline");
+        assert_eq!(b_inline.depth(), 2);
+
+        // 0x00cf
+        let b_inline = main_ii.next().unwrap().unwrap();
+        assert_eq!(b_inline.name().unwrap().unwrap(), "c_inline");
+        assert_eq!(b_inline.depth(), 3);
+
+        // 0x00ee
+        let b_inline = main_ii.next().unwrap().unwrap();
+        assert_eq!(b_inline.name().unwrap().unwrap(), "d_inline");
+        assert_eq!(b_inline.depth(), 4);
+
+        assert!(main_ii.next().unwrap().is_none());
+
+        // 0x013a..=0x0158 should be skipped due to being abstract (`DW_AT_inline`)
+
+        // 0x0162
+        let mut a = sp_iter.next().unwrap().unwrap();
+        assert_eq!(a.info.depth(), 0);
+        assert_eq!(a.info.name().unwrap().unwrap(), "a");
+        assert!(a.info.call_line().is_none());
+        assert!(a.info.call_file().unwrap().is_none());
+        let rng: Vec<_> = a.info.take_ranges().unwrap().collect().unwrap();
+        assert_eq!(rng, [0x7c0..0x7c4]);
+
+        // 0x018e
+        let b = sp_iter.next().unwrap().unwrap();
+        assert_eq!(b.info.depth(), 0);
+        assert_eq!(b.info.name().unwrap().unwrap(), "b");
+
+        // 0x01ba
+        let c = sp_iter.next().unwrap().unwrap();
+        assert_eq!(c.info.depth(), 0);
+        assert_eq!(c.info.name().unwrap().unwrap(), "c");
+
+        // 0x01e6
+        let d = sp_iter.next().unwrap().unwrap();
+        assert_eq!(d.info.depth(), 0);
+        assert_eq!(d.info.name().unwrap().unwrap(), "d");
+
+        // 0x0220 should be skipped due to `DW_AT_declaration`
+
+        assert!(sp_iter.next().unwrap().is_none());
+    }
+}
diff --git a/rust-crates/symblib/src/fileid.rs b/rust-crates/symblib/src/fileid.rs
new file mode 100644
index 00000000..cb1473cc
--- /dev/null
+++ b/rust-crates/symblib/src/fileid.rs
@@ -0,0 +1,143 @@
+// Copyright The OpenTelemetry Authors
+// SPDX-License-Identifier: Apache-2.0
+
+//! Types for uniquely identifying files.
+
+use base64::Engine;
+use sha2::digest::FixedOutput;
+use sha2::Digest as _;
+use std::io::Read as _;
+use std::{fmt, fs, io, path};
+
+/// Size of the head and tail blocks used for partially hashing ELF files.
+const PARTIAL_HASH_SIZE: u64 = 4096;
+
+/// Hash-based unique file identifier.
+///
+/// This ID is compatible with the file ID format that is calculated and sent
+/// to the collection agent by our host agent.
+///
+/// https://github.com/open-telemetry/opentelemetry-ebpf-profiler/blob/cd3963/libpf/fileid.go#L124
+#[repr(transparent)]
+#[derive(PartialEq, Eq, Hash, Clone, Copy)]
+pub struct FileId(u128);
+
+impl FileId {
+    /// Construct the ID from two `u64` halves.
+    pub fn from_parts(hi: u64, lo: u64) -> Self {
+        Self((hi as u128) << 64 | lo as u128)
+    }
+
+    /// Calculates the file ID for the file at the given path.
+ pub fn from_path(path: &path::Path) -> io::Result { + Self::from_stream(&fs::File::open(path)?) + } + + /// Calculates the file ID from the given seekable stream. + /// + /// If this function succeeds, the stream is seeked back to the original + /// position, otherwise the file position is undefined. + pub fn from_stream(mut stream: impl io::Read + io::Seek) -> io::Result { + let prev_pos = stream.seek(io::SeekFrom::End(0))?; + let stream_len = stream.stream_position()?; + let mut hasher = sha2::Sha256::new(); + + // Hash first 4096 bytes. + stream.seek(io::SeekFrom::Start(0))?; + io::copy(&mut stream.by_ref().take(PARTIAL_HASH_SIZE), &mut hasher)?; + + // Hash last 4096 bytes. + let tail_start = stream_len.saturating_sub(PARTIAL_HASH_SIZE); + stream.seek(io::SeekFrom::Start(tail_start))?; + io::copy(&mut stream.by_ref().take(PARTIAL_HASH_SIZE), &mut hasher)?; + + // Hash length. + hasher.update(u64::to_be_bytes(stream_len)); + + stream.seek(io::SeekFrom::Start(prev_pos))?; + + let digest: [u8; 32] = hasher.finalize_fixed().into(); + let truncated: [u8; 16] = digest[..16].try_into().unwrap(); + Ok(Self(u128::from_be_bytes(truncated))) + } + + /// Formats the ID in the base64 format used in our ES indices. + pub fn format_es(&self) -> String { + let mut out = String::with_capacity(128); + ES_B64_ENGINE.encode_string(self.0.to_be_bytes(), &mut out); + out + } + + /// Try to parse a file ID in ES format. + /// + /// Returns `None` if the input is not a valid file ID. + pub fn try_parse_es(text_repr: &str) -> Option { + let bytes = ES_B64_ENGINE.decode(text_repr).ok()?; + let sized: [u8; 16] = bytes.try_into().ok()?; + Some(Self(u128::from_be_bytes(sized))) + } + + /// Formats the ID as a lower-case hexadecimal number. + pub fn format_hex(&self) -> String { + format!("{:032x}", self.0) + } + + /// Try to parse a file ID from a string in hex format. 
+ pub fn try_parse_hex(text_repr: &str) -> Option { + let tmp = u128::from_str_radix(text_repr, 16).ok()?; + Some(Self(tmp)) + } +} + +/// base64 engine that en-/decodes in our ES base64 representation. +static ES_B64_ENGINE: base64::engine::GeneralPurpose = base64::engine::GeneralPurpose::new( + &base64::alphabet::URL_SAFE, + base64::engine::GeneralPurposeConfig::new() + .with_encode_padding(false) + .with_decode_padding_mode(base64::engine::DecodePaddingMode::Indifferent), +); + +/// Debug formatting. +impl fmt::Debug for FileId { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "FileId({})", self.format_hex()) + } +} + +/// Construct a file ID from an unsigned 128 bit integer. +impl From for FileId { + fn from(value: u128) -> Self { + Self(value) + } +} + +/// Get a file ID as a 128 bit integer. +impl From for u128 { + fn from(value: FileId) -> Self { + value.0 + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::tests::testdata; + + #[test] + fn hash_elf() { + let id = FileId::from_path(&testdata("inline")).unwrap(); + assert_eq!(id, 0xc34f3585fca1b579fb458e827851a599.into()); + assert_eq!(id.format_hex(), "c34f3585fca1b579fb458e827851a599"); + assert_eq!(id.format_es(), "w081hfyhtXn7RY6CeFGlmQ"); + } + + #[test] + fn hash_non_elf() { + let zeros = &[0u8; 123]; + let mut cursor = io::Cursor::new(&zeros); + let id = FileId::from_stream(&mut cursor).unwrap(); + assert_eq!(id, 0xf4c1e5fe2f28034fcceb0776ec00b125.into()); + assert_eq!(id.format_hex(), "f4c1e5fe2f28034fcceb0776ec00b125"); + assert_eq!(id.format_es(), "9MHl_i8oA0_M6wd27ACxJQ"); + } +} diff --git a/rust-crates/symblib/src/gosym/errors.rs b/rust-crates/symblib/src/gosym/errors.rs new file mode 100644 index 00000000..43a8afcb --- /dev/null +++ b/rust-crates/symblib/src/gosym/errors.rs @@ -0,0 +1,61 @@ +// Copyright The OpenTelemetry Authors +// SPDX-License-Identifier: Apache-2.0 + +use super::*; + +/// Result type shorthand. 
+pub type Result = std::result::Result; + +/// Errors that can occur during parsing. +#[non_exhaustive] +#[allow(missing_docs)] +#[derive(Debug, thiserror::Error)] +pub enum Error { + #[error("Unable to find gopclntab")] + GopclntabNotFound, + + #[error("Unable to find module data")] + ModuleDataNotFound, + + #[error("Unable to find code section (`.text`)")] + CodeSectionNotFound, + + #[error("Unable to resolve the destination of the gofunc pointer")] + BadGoFuncPtr, + + #[error("Found pointer to invalid memory")] + InvalidPtr, + + #[error("Go symbols section is malformed")] + MalformedGopclntab, + + #[error("Go version is not supported")] + UnsupportedGoVersion, + + #[error("Inline index is malformed")] + BadInlineIndex, + + #[error("File index is malformed")] + BadFileIndex, + + #[error("Line number is malformed")] + BadLineNumber, + + #[error("Unexpected end of file")] + UnexpectedEof, + + #[error("Encountered non-utf8 string")] + NonUtf8String, + + #[error("Variable length integer is too big")] + VarIntTooLong, + + #[error("Unable to read section without copying")] + CannotAvoidCopy, + + #[error("Reader currently doesn't support big endian binaries")] + BigEndian, + + #[error("objfile error: {}", .0)] + Objfile(#[from] objfile::Error), +} diff --git a/rust-crates/symblib/src/gosym/mod.rs b/rust-crates/symblib/src/gosym/mod.rs new file mode 100644 index 00000000..860860e8 --- /dev/null +++ b/rust-crates/symblib/src/gosym/mod.rs @@ -0,0 +1,526 @@ +// Copyright The OpenTelemetry Authors +// SPDX-License-Identifier: Apache-2.0 + +//! Library for zero-copy decoding of Go runtime information. +//! +//! The format is documented in detail [here](https://github.com/open-telemetry/opentelemetry-ebpf-profiler/blob/c96e126866/doc/gopclntab.md). + +// If we ever want to support building for 32 bit, we'd need to get rid of all +// the `as usize` casts and instead do checked conversions.
Removing this +// check would not result in any safety hazards, but parsing of executables +// that don't fit memory of 32 bit machines would fail in quite unintuitive +// ways and debug builds might panic due to integer overflows. +#[cfg(not(target_pointer_width = "64"))] +compile_error!("gosym currently assumes sizeof(usize) == 8"); + +mod errors; +pub use errors::*; +mod raw; + +use crate::{objfile, VirtAddr}; +use fallible_iterator::FallibleIterator; +use std::ops::Range; + +/// Decoder for Go runtime information. +#[derive(Debug)] +pub struct GoRuntimeInfo<'obj> { + header: raw::Header, + text_start: VirtAddr, + func_name_table: raw::FuncNameTable<'obj>, + pc_table: raw::PcTable<'obj>, + cu_table: raw::CuTable<'obj>, + func_table: raw::FuncTable<'obj>, + file_name_table: raw::FileNameTable<'obj>, + func_data: raw::FuncData<'obj>, +} + +impl<'obj> GoRuntimeInfo<'obj> { + /// Locate Go runtime information in the given object file. + pub fn open(obj: &objfile::Reader<'obj>) -> Result { + if !obj.is_little_endian() { + return Err(Error::BigEndian); + } + + // Locate `.gopclntab` section. + let mem = obj.memory_map()?; + let (gopclntab_va, gopclntab) = Self::find_gopclntab(obj, &mem)?; + + // Read the header. + let header = raw::Header::read(gopclntab)?; + let gopclntab = raw::Reader::new(header, gopclntab_va, gopclntab); + let offsets = raw::HeaderOffsets::new(gopclntab.clone())?; + + // Go versions older than 1.16 are currently not supported. + if header.version < Version::V116 { + return Err(Error::UnsupportedGoVersion); + } + + // Go >= 1.18 uses offsets relative to `go:func.*` for function data + // whereas older Go versions simply emit absolute pointers. + let func_data = if header.version >= Version::V118 { + raw::FuncData::GoFunc(Self::find_gofunc( + obj, + &mem, + gopclntab_va, + header, + &offsets, + )?) + } else { + raw::FuncData::Global(header, mem) + }; + + // Fall back to code section address if the text start isn't + // available from the header.
+ let text_start = if let Some(start) = offsets.text_start { + start + } else { + obj.load_section(b".text")? + .ok_or(Error::CodeSectionNotFound)? + .virt_addr() + }; + + // Create decoders for the various sub-regions of runtime info. + Ok(Self { + func_data, + text_start, + header, + func_name_table: raw::FuncNameTable::new(&offsets, gopclntab.clone())?, + file_name_table: raw::FileNameTable::new(&offsets, gopclntab.clone())?, + func_table: raw::FuncTable::new(&offsets, gopclntab.clone())?, + pc_table: raw::PcTable::new(&offsets, gopclntab.clone())?, + cu_table: raw::CuTable::new(&offsets, gopclntab)?, + }) + } + + /// Returns the Go runtime data version. + pub fn version(&self) -> Version { + self.header.version + } + + /// Iterate over all top-level functions in the executable. + pub fn funcs<'rt>(&'rt self) -> Result> { + Ok(FuncIter { + rt: self, + iter: self.func_table.index_iter()?, + }) + } +} + +/// Internal helpers. +impl<'obj> GoRuntimeInfo<'obj> { + /// Locate the `go:func.*` memory region. + fn find_gofunc( + obj: &objfile::Reader<'obj>, + mem: &objfile::MemoryMap<'obj>, + gopclntab_va: VirtAddr, + header: raw::Header, + offsets: &raw::HeaderOffsets, + ) -> Result> { + // The region is pointed to only by the module data record, not by + // the normal `.gopclntab` header. Need to locate module data first. + let module_data = Self::find_module_data(obj, mem, gopclntab_va, header, &offsets)?; + + // NOTE: in newer Go versions there can be more than one module data + // record via the `next` field, but I'm not sure when this is + // used. Never seen it filled. Perhaps for Go shared libraries? + + let sec = mem + .section_for_addr(module_data.go_func) + .ok_or(Error::BadGoFuncPtr)?; + let data = sec.as_obj_slice().ok_or(Error::CannotAvoidCopy)?; + let offset = (module_data.go_func - sec.virt_addr()) as usize; + let reader = raw::Reader::new(header, module_data.go_func, &data[offset..]); + + Ok(reader) + } + + /// Locate the `.gopclntab` section. 
+ /// + /// Uses section headers when present and falls back to a heuristic + /// approach that scans for the known portions of the gopclntab header. + fn find_gopclntab( + obj: &objfile::Reader<'obj>, + mem: &objfile::MemoryMap<'obj>, + ) -> Result<(VirtAddr, &'obj [u8])> { + // Try section headers first. + for sec_name in [ + b".gopclntab".as_slice(), + b".data.rel.ro.gopclntab".as_slice(), + ] { + if let Some(sec) = obj.load_section(sec_name)? { + let data = sec.as_obj_slice().ok_or(Error::CannotAvoidCopy)?; + return Ok((sec.virt_addr(), data)); + } + } + + // Infer pointer size and quantum from architecture. + let (ptr_size, quantum) = match obj.arch() { + Some(objfile::Arch::X86_64) => (8, 1), + Some(objfile::Arch::Aarch64) => (8, 4), + None => return Err(Error::GopclntabNotFound), + }; + + // Scan all memory for header signature. + for region in mem { + // Scan with a stride of `ptr_size`: we expect the header + // to be stored aligned to that at the very least. + for (offs, window) in region.windows(8).enumerate().step_by(ptr_size.into()) { + if &window[1..4] != b"\xFF\xFF\xFF" { + continue; + } + if &window[4..] != &[0, 0, quantum, ptr_size] { + continue; + } + if let Err(_) = Version::from_magic(window[..4].try_into().unwrap()) { + continue; + } + + let va = region.virt_addr() + offs as u64; + let slice = region.as_obj_slice().ok_or(Error::CannotAvoidCopy)?; + let gopclntab = &slice[offs..]; + + return Ok((va, gopclntab)); + } + } + + Err(Error::GopclntabNotFound) + } + + /// Locate and parse `runtime.firstmoduledata`. + /// + /// Uses object symbols when present and falls back to a heuristic + /// approach that scans the executable for a known pattern if symbols + /// aren't available. + fn find_module_data( + obj: &objfile::Reader<'_>, + mem: &objfile::MemoryMap<'_>, + gopclntab_va: VirtAddr, + header: raw::Header, + offsets: &raw::HeaderOffsets, + ) -> Result { + // Try via symbol lookup first.
This is a `LOCAL` symbol that will + // likely be stripped in most production executables, but it's worth + // a try: the fallback path has to scan a lot of memory. + if let Some(sym) = obj.resolve_symbol("runtime.firstmoduledata") { + let sec = mem + .section_for_addr(sym.virt_addr) + .ok_or(Error::InvalidPtr)?; + let slice = &sec[(sym.virt_addr - sec.virt_addr()) as usize..]; + let reader = raw::Reader::new(header, sym.virt_addr, slice); + return raw::ModuleData::read(reader); + } + + // No luck with symbols. Fall back to locating it via the pointer to + // `.gopclntab` that it always starts with. Approach inspired by what + // Stephen Eckels describes here: + // + // https://www.mandiant.com/resources/blog/golang-internals-symbol-recovery + let needle = &gopclntab_va.to_le_bytes()[..header.ptr_size as usize]; + let expected_funcnametab = offsets.funcname_offset.0.wrapping_add(gopclntab_va); + let expected_cutab = offsets.cutab_offset.0.wrapping_add(gopclntab_va); + + for region in mem { + for (offs, window) in region + .windows(needle.len()) + .enumerate() + .step_by(header.ptr_size as usize) + { + if window != needle { + continue; + } + + let addr = region.virt_addr().wrapping_add(offs as u64); + let reader = raw::Reader::new(header, addr, &region[offs..]); + let Ok(candidate) = raw::ModuleData::read(reader) else { + continue; + }; + + // Validate a few fields against gopclntab. + if candidate.funcnametab != expected_funcnametab + || candidate.cutab != expected_cutab + { + continue; + } + + // Looking good! + return Ok(candidate); + } + } + + Err(Error::ModuleDataNotFound) + } +} + +/// Version of the Go runtime information. +/// +/// The data format usually stays the same for multiple Go versions. +/// We thus only list versions where significant changes occurred. +#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)] +pub enum Version { + /// Go v1.2 - v1.15. + V12, + /// Go 1.16 - v1.17. + V116, + /// Go 1.18 - 1.19.
+ V118, + /// Go 1.20 - latest as of writing. + V120, +} + +impl std::fmt::Display for Version { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.write_str(match self { + Version::V12 => "v1.2", + Version::V116 => "v1.16", + Version::V118 => "v1.18", + Version::V120 => "v1.20", + }) + } +} + +impl Version { + fn from_magic(magic: [u8; 4]) -> Result { + Ok(match &magic { + b"\xFB\xFF\xFF\xFF" => Version::V12, + b"\xFA\xFF\xFF\xFF" => Version::V116, + b"\xF0\xFF\xFF\xFF" => Version::V118, + b"\xF1\xFF\xFF\xFF" => Version::V120, + _ => return Err(Error::UnsupportedGoVersion), + }) + } +} + +/// Iterator over all top-level functions in the executable. +pub struct FuncIter<'rt, 'obj> { + rt: &'rt GoRuntimeInfo<'obj>, + iter: raw::FuncIndexIter<'obj>, +} + +impl<'rt, 'obj> FallibleIterator for FuncIter<'rt, 'obj> { + type Item = Func<'rt, 'obj>; + type Error = Error; + + fn next(&mut self) -> Result> { + let Some(index_entry) = self.iter.next()? else { + return Ok(None); + }; + + let raw = self.rt.func_table.func(index_entry.funcoff)?; + Ok(Some(Func { rt: self.rt, raw })) + } +} + +/// Top-level function in the executable. +#[derive(Debug)] +pub struct Func<'rt, 'obj> { + rt: &'rt GoRuntimeInfo<'obj>, + raw: raw::Func<'obj>, +} + +impl<'rt, 'obj> Func<'rt, 'obj> { + /// Returns the start address of this function. + pub fn start_addr(&self) -> VirtAddr { + match self.raw.func_pc { + raw::CodePtr::Addr(va) => va, + raw::CodePtr::Offs(offs) => self.rt.text_start.wrapping_add(offs.0), + } + } + + /// Read the function name. + pub fn name(&self) -> Result<&'obj str> { + self.rt.func_name_table.name(self.raw.name) + } + + /// Construct an iterator yielding mappings from PC to file names. 
+ pub fn file_mapping(&self) -> Result> { + Ok(PcFileIter { + rt: self.rt, + pc_base: self.start_addr(), + cu_offset: self.raw.cu_offset, + iter: self.rt.pc_table.pcdata(self.raw.pcfile)?, + }) + } + + /// Construct an iterator yielding mappings from PC to line numbers. + pub fn line_mapping(&self) -> Result> { + Ok(PcLineIter { + pc_base: self.start_addr(), + iter: self.rt.pc_table.pcdata(self.raw.pcln)?, + }) + } + + /// First line of the function definition. + /// + /// Only available for Go >= v1.20. + pub fn start_line(&self) -> Option { + self.raw.start_line + } + + /// Construct an iterator yielding mappings from PC to the deepest inline + /// function in the inline tree. + /// + /// You can then use [`InlinedCall::parent_pc`] to figure out the parent + /// inline function (doing another pass through the inline mapping). + pub fn inline_mapping(&self) -> Result>> { + use raw::{FuncDataField::*, PcDataField::*}; + + let Some(inline_tree) = self.raw.func_data(InlTree) else { + return Ok(None); + }; + let Some(index_pcdata) = self.raw.pc_data(InlTreeIndex) else { + return Ok(None); + }; + + Ok(Some(InlineTreeIter { + rt: self.rt, + inline_tree, + pc_base: self.start_addr(), + iter: self.rt.pc_table.pcdata(index_pcdata)?, + })) + } +} + +/// Iterator over mappings from PCs to the file name +/// that the code was generated from. +#[derive(Debug)] +pub struct PcFileIter<'rt, 'obj> { + rt: &'rt GoRuntimeInfo<'obj>, + pc_base: VirtAddr, + cu_offset: raw::CuTabIndex, + iter: raw::PcDataReader<'obj>, +} + +impl<'rt, 'obj> FallibleIterator for PcFileIter<'rt, 'obj> { + type Item = (Range, Option<&'obj str>); + type Error = Error; + + fn next(&mut self) -> Result> { + let Some((pc_offs, file_ref)) = self.iter.next()?
else { + return Ok(None); + }; + + let pc = range_rel2abs(self.pc_base, pc_offs); + + let cu_file_idx = match file_ref { + i32::MIN..=-2 => return Err(Error::BadFileIndex), + -1 => return Ok(Some((pc, None))), + 0..=i32::MAX => raw::CuTabIndex(file_ref as u32), + }; + + let offs = self + .rt + .cu_table + .file_name_offset(self.cu_offset, cu_file_idx)?; + + let file = if offs != raw::FileNameOffset::INVALID { + Some(self.rt.file_name_table.name(offs)?) + } else { + None + }; + + Ok(Some((pc, file))) + } +} + +/// Iterator over mappings from PCs to the line number +/// that the code was generated from. +#[derive(Debug)] +pub struct PcLineIter<'obj> { + pc_base: VirtAddr, + iter: raw::PcDataReader<'obj>, +} + +impl<'obj> FallibleIterator for PcLineIter<'obj> { + type Item = (Range, Option); + type Error = Error; + + fn next(&mut self) -> Result> { + let Some((pc_offs, raw_line)) = self.iter.next()? else { + return Ok(None); + }; + + let pc = range_rel2abs(self.pc_base, pc_offs); + + let line = match raw_line { + i32::MIN..=-2 | 0 => return Err(Error::BadLineNumber), + -1 => None, + 0..=i32::MAX => Some(raw_line as u32), + }; + + Ok(Some((pc, line))) + } +} + +/// Iterator over mappings from PCs to inline calls. +#[derive(Debug)] +pub struct InlineTreeIter<'rt, 'obj> { + rt: &'rt GoRuntimeInfo<'obj>, + inline_tree: raw::FuncDataRef, + pc_base: VirtAddr, + iter: raw::PcDataReader<'obj>, +} + +impl<'rt, 'obj> FallibleIterator for InlineTreeIter<'rt, 'obj> { + type Item = (Range, Option>); + type Error = Error; + + fn next(&mut self) -> Result> { + let Some((pc_offs, tree_idx)) = self.iter.next()? 
else { + return Ok(None); + }; + + let pc = range_rel2abs(self.pc_base, pc_offs); + + let tree_idx = match tree_idx { + i32::MIN..=-2 => return Err(Error::BadInlineIndex), + -1 => return Ok(Some((pc, None))), + 0..=i32::MAX => raw::InlineTreeIndex(tree_idx as u32), + }; + + let call = InlinedCall { + rt: self.rt, + outer_fn_entry: self.pc_base, + raw: self.rt.func_data.inlined_call(self.inline_tree, tree_idx)?, + }; + + Ok(Some((pc, Some(call)))) + } +} + +/// Represents a function that got inlined into a top-level function. +#[derive(Debug)] +pub struct InlinedCall<'rt, 'obj> { + rt: &'rt GoRuntimeInfo<'obj>, + outer_fn_entry: VirtAddr, + raw: raw::InlinedCall, +} + +impl<'rt, 'obj> InlinedCall<'rt, 'obj> { + /// Read the name of the function that got inlined. + pub fn name(&self) -> Result<&'obj str> { + self.rt.func_name_table.name(self.raw.name_offset) + } + + /// Gets the first line number of the inlined function. + /// + /// Only available for Go >= 1.20. + pub fn start_line(&self) -> Option { + match &self.raw.info { + raw::InlinedCallInfo::New { start_line, .. } => Some(*start_line), + raw::InlinedCallInfo::Old { .. } => None, + } + } + + /// Gets the address of the next higher function in the inline chain. + pub fn parent_pc(&self) -> VirtAddr { + self.outer_fn_entry.wrapping_add(self.raw.parent_pc) + } +} + +/// Makes a relative PC offset range absolute. +fn range_rel2abs(base: VirtAddr, rng: Range) -> Range { + Range { + start: base.wrapping_add(rng.start.0), + end: base.wrapping_add(rng.end.0), + } +} diff --git a/rust-crates/symblib/src/gosym/raw/mod.rs b/rust-crates/symblib/src/gosym/raw/mod.rs new file mode 100644 index 00000000..7ea6c745 --- /dev/null +++ b/rust-crates/symblib/src/gosym/raw/mod.rs @@ -0,0 +1,18 @@ +// Copyright The OpenTelemetry Authors +// SPDX-License-Identifier: Apache-2.0 + +//! Provides decoding of the raw data structures. 
+ +mod reader; +mod regions; +mod structs; +mod types; + +// Re-export some stuff that is needed across all `raw` submodules. +use super::{Error, Result, Version}; +use crate::VirtAddr; + +pub use reader::*; +pub use regions::*; +pub use structs::*; +pub use types::*; diff --git a/rust-crates/symblib/src/gosym/raw/reader.rs b/rust-crates/symblib/src/gosym/raw/reader.rs new file mode 100644 index 00000000..ccafc8aa --- /dev/null +++ b/rust-crates/symblib/src/gosym/raw/reader.rs @@ -0,0 +1,266 @@ +// Copyright The OpenTelemetry Authors +// SPDX-License-Identifier: Apache-2.0 + +//! Helpers for reading Go's data structures. + +use super::*; +use std::collections::Bound; +use std::ffi::CStr; +use std::ops::RangeBounds; +use std::slice::SliceIndex; + +/// Reader ("cursor") type for reading Go data structures. +#[derive(Clone)] +pub struct Reader<'obj> { + header: Header, + addr: VirtAddr, + data: &'obj [u8], +} + +/// Implements a read function for each given primitive integer type. +macro_rules! impl_read_prim { + ( $($ty:ident),* ) => {$( + #[doc=concat!("Reads the next `", stringify!($ty), "`.")] + pub fn $ty(&mut self) -> Result<$ty> { + let sz = std::mem::size_of::<$ty>(); + + if self.data.len() < sz { + return Err(Error::UnexpectedEof); + } + + let r = self.data[..sz].try_into().unwrap(); + let v = $ty::from_le_bytes(r); + + self.skip(sz); + + Ok(v) + } + )*} +} + +impl<'obj> Reader<'obj> { + /// Create a new reader from a slice and a header. + pub fn new(header: Header, addr: VirtAddr, data: &'obj [u8]) -> Self { + Self { header, addr, data } + } + + /// Creates a new reader for a region within this reader. 
+ pub fn sub_reader(&self, rng: T) -> Result + where + T: Clone + RangeBounds + SliceIndex<[u8], Output = [u8]>, + { + let mut new = self.clone(); + + new.data = new.data.get(rng.clone()).ok_or(Error::UnexpectedEof)?; + + let start_offset = match rng.start_bound() { + Bound::Included(x) => *x, + Bound::Excluded(_) => unreachable!("start bound cannot be excluded"), + Bound::Unbounded => 0, + }; + + new.addr = new.addr.wrapping_add(start_offset as u64); + + Ok(new) + } + + /// Skip `n` bytes. + pub fn skip(&mut self, n: usize) -> &mut Self { + self.data = &self.data[n.min(self.data.len())..]; + self.addr = self.addr.wrapping_add(n as u64); + self + } + + /// Align to next multiple of pointer size. + pub fn align_up(&mut self) -> &mut Self { + let dangling = self.addr as usize % self.ptr_size(); + if dangling != 0 { + self.skip(self.ptr_size() - dangling); + } + self + } + + impl_read_prim!(u8, u32, u64, i16); + + /// Read the next pointer-sized integer. + pub fn uintptr(&mut self) -> Result { + Ok(match self.header.ptr_size { + 4 => self.u32()? as u64, + 8 => self.u64()?, + _ => unreachable!("pre-checked on construction"), + }) + } + + /// Reads the next code pointer. + pub fn code_ptr(&mut self) -> Result { + Ok(if self.version() >= Version::V118 { + CodePtr::Offs(TextStartOffset(self.u32()? as u64)) + } else { + CodePtr::Addr(self.uintptr()?.wrapping_mul(self.quantum() as u64)) + }) + } + + /// Read a zero-terminated string. + pub fn str(&mut self) -> Result<&'obj str> { + let str = CStr::from_bytes_until_nul(self.data) + .map_err(|_| Error::UnexpectedEof)? + .to_str() + .map_err(|_| Error::NonUtf8String)?; + self.skip(str.len() + 1); + Ok(str) + } + + /// Reads a variable-length encoded `u32`. + pub fn var_u32(&mut self) -> Result { + let mut v = 0; + for shift in (0..=31).step_by(7) { + let b = self.u8()? 
as u32; + v |= (b & 0x7F) << shift; + if b & 0x80 == 0 { + if shift == 4 * 7 && b & 0b0111_0000 != 0 { + return Err(Error::VarIntTooLong); + } + return Ok(v); + } + } + Err(Error::VarIntTooLong) + } + + /// Reads a zig-zag variable-length encoded `i32`. + pub fn var_i32(&mut self) -> Result { + let zigzag = self.var_u32()? as i32; + Ok(-(zigzag & 1) ^ (zigzag >> 1)) + } + + /// Returns true if the reader doesn't have any data left. + pub fn is_empty(&self) -> bool { + self.data.is_empty() + } + + /// Gets the whole header. + pub fn header(&self) -> Header { + self.header + } + + /// Gets the Go version. + pub fn version(&self) -> Version { + self.header.version + } + + /// Gets the pointer size. + pub fn ptr_size(&self) -> usize { + self.header.ptr_size as usize + } + + /// Gets the code pointer quantum. + pub fn quantum(&self) -> usize { + self.header.quantum as usize + } +} + +/// Custom debug impl to prevent printing huge byte arrays. +impl std::fmt::Debug for Reader<'_> { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "Reader({} bytes @ {:#08X})", self.data.len(), self.addr) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + static HEADER: Header = Header { + version: Version::V120, + quantum: 1, + ptr_size: 8, + }; + + #[test] + fn read_primitives() { + let data: Vec<_> = (0..=0xff).collect(); + let mut reader = Reader::new(HEADER, 0, &data); + assert_eq!(reader.u32().unwrap(), 0x03020100); + assert_eq!(reader.u8().unwrap(), 0x04); + assert_eq!(reader.skip(3).u64().unwrap(), 0xf0e0d0c0b0a0908); + } + + #[test] + fn sub_reader() { + let data = b"\xAA\xBB\xCC\xDD\xEE\xFF"; + let all = Reader::new(HEADER, 0, data); + + { + let mut sub = all.sub_reader(..2).unwrap(); + assert_eq!(sub.skip(1).u8().unwrap(), 0xBB); + assert!(sub.is_empty()); + } + + { + let mut sub = all.sub_reader(3..4).unwrap(); + assert_eq!(sub.u8().unwrap(), 0xDD); + assert!(sub.is_empty()); + } + + { + let mut sub = all.sub_reader(4..).unwrap(); + 
assert_eq!(sub.u8().unwrap(), 0xEE); + assert_eq!(sub.u8().unwrap(), 0xFF); + assert!(sub.is_empty()); + } + + for offs in (data.len() - 3)..(data.len() + 20) { + assert!(all.sub_reader(offs..offs + 4).is_err(), "{offs}"); + } + } + + #[test] + fn var_u32() { + let r = |x| Reader::new(HEADER, 0, x).var_u32(); + + assert!(matches!(r(b"\xe5\x8e\xa6"), Err(Error::UnexpectedEof))); + assert!(matches!(r(b""), Err(Error::UnexpectedEof))); + assert!(matches!(r(b"\x95\x9a\xef\x3a"), Ok(123456789))); + assert!(matches!(r(b"\xff\xff\xff\xff\x0f"), Ok(u32::MAX))); + assert!(matches!( + r(b"\xff\xff\xff\xff\x10"), + Err(Error::VarIntTooLong) + )); + + assert!(matches!(r(b"\x00"), Ok(0))); + assert!(matches!(r(b"\x01"), Ok(1))); + assert!(matches!(r(b"\x7f"), Ok(0x7f))); + assert!(matches!(r(b"\x7f"), Ok(127))); + assert!(matches!(r(b"\x80\x01"), Ok(128))); + assert!(matches!(r(b"\x80\x01"), Ok(128))); + assert!(matches!(r(b"\xff\x01"), Ok(255))); + assert!(matches!(r(b"\x80\x02"), Ok(256))); + } + + #[test] + fn var_i32() { + let r = |x| Reader::new(HEADER, 0, x).var_i32(); + + assert!(matches!(r(b"\x00"), Ok(0))); + assert!(matches!(r(b"\x01"), Ok(-1))); + assert!(matches!(r(b"\x02"), Ok(1))); + assert!(matches!(r(b"\x03"), Ok(-2))); + assert!(matches!(r(b"\x04"), Ok(2))); + } + + #[test] + fn str() { + // Valid string + let mut reader = Reader::new(HEADER, 0, b"hello\x00\x11"); + assert_eq!(reader.str().unwrap(), "hello"); + assert_eq!(reader.u8().unwrap(), 0x11); + assert!(reader.is_empty()); + + // Unterminated string + let mut reader = Reader::new(HEADER, 0, b"hello"); + assert!(matches!(reader.str(), Err(Error::UnexpectedEof))); + + // Bad UTF-8 + let mut reader = Reader::new(HEADER, 0, b"\xc3\x28\x00"); + assert!(matches!(reader.str(), Err(Error::NonUtf8String))); + } +} diff --git a/rust-crates/symblib/src/gosym/raw/regions.rs b/rust-crates/symblib/src/gosym/raw/regions.rs new file mode 100644 index 00000000..4b7daf1b --- /dev/null +++ 
b/rust-crates/symblib/src/gosym/raw/regions.rs @@ -0,0 +1,162 @@ +// Copyright The OpenTelemetry Authors +// SPDX-License-Identifier: Apache-2.0 + +//! Reference types for sub-regions of the Go runtime information. +//! +//! Most of these represent sub-regions of the `.gopclntab` section, but +//! there are some exceptions where data lives in other sections. + +use super::*; +use crate::objfile; +use fallible_iterator::FallibleIterator; + +/// Decoder for `.gopclntab`:`runtime.filetab`. +#[derive(Debug)] +pub struct FileNameTable<'obj>(Reader<'obj>); + +impl<'obj> FileNameTable<'obj> { + pub fn new(o: &HeaderOffsets, gopclntab: Reader<'obj>) -> Result { + Ok(Self(gopclntab.sub_reader(o.filetab_offset.0 as usize..)?)) + } + + pub fn name(&self, offset: FileNameOffset) -> Result<&'obj str> { + self.0.sub_reader(offset.0 as usize..)?.str() + } +} + +/// Decoder for `.gopclntab`:`runtime.cutab`. +#[derive(Debug)] +pub struct CuTable<'obj>(Reader<'obj>); + +impl<'obj> CuTable<'obj> { + pub fn new(o: &HeaderOffsets, gopclntab: Reader<'obj>) -> Result { + gopclntab.sub_reader(o.cutab_offset.0 as usize..).map(Self) + } + + pub fn file_name_offset( + &self, + cu_idx: CuTabIndex, + fn_idx: CuTabIndex, + ) -> Result { + let offs = (cu_idx.0 as u64 + fn_idx.0 as u64) * 4; + self.0 + .sub_reader(offs as usize..)? + .u32() + .map(FileNameOffset) + } +} + +/// Decoder for `.gopclntab`:`runtime.funcnametab`. +#[derive(Debug)] +pub struct FuncNameTable<'obj>(Reader<'obj>); + +impl<'obj> FuncNameTable<'obj> { + pub fn new(o: &HeaderOffsets, gopclntab: Reader<'obj>) -> Result { + gopclntab + .sub_reader(o.funcname_offset.0 as usize..) + .map(Self) + } + + pub fn name(&self, offset: FuncNameOffset) -> Result<&'obj str> { + self.0.sub_reader(offset.0 as usize..)?.str() + } +} + +/// Decoder for `.gopclntab`:`runtime.pctab`. +/// +/// Note that while Go calls this a "table" it is actually just a +/// concatenation of `pcdata` sequences (see [`PcDataReader`]). 
+#[derive(Debug)] +pub struct PcTable<'obj>(Reader<'obj>); + +impl<'obj> PcTable<'obj> { + pub fn new(o: &HeaderOffsets, gopclntab: Reader<'obj>) -> Result { + gopclntab.sub_reader(o.pctab_offset.0 as usize..).map(Self) + } + + pub fn pcdata(&self, offset: PcTabOffset) -> Result> { + self.0 + .sub_reader(offset.0 as usize..) + .map(PcDataReader::new) + } +} + +/// Decoder for `.gopclntab`:`runtime.functab`. +/// +/// `runtime.functab`, in this case, refers to the label with that name that +/// the linker emits, not the structure type with the same name. +#[derive(Debug)] +pub struct FuncTable<'obj> { + reader: Reader<'obj>, + num_funcs: u64, +} + +impl<'obj> FuncTable<'obj> { + pub fn new(o: &HeaderOffsets, gopclntab: Reader<'obj>) -> Result { + Ok(Self { + reader: gopclntab.sub_reader(o.pcln_offset.0 as usize..)?, + num_funcs: o.num_funcs, + }) + } + + pub fn index_iter(&self) -> Result> { + let sz = FuncTabIndexEntry::size_of(self.reader.header()); + let reader = self.reader.sub_reader(..sz * self.num_funcs as usize)?; + Ok(FuncIndexIter(reader)) + } + + pub fn func(&self, offs: FuncTabOffset) -> Result> { + Func::read(self.reader.sub_reader(offs.0 as usize..)?) + } +} + +/// Iterator over the index in `.gopclntab`:`runtime.functab`. +#[derive(Debug)] +pub struct FuncIndexIter<'obj>(Reader<'obj>); + +impl FallibleIterator for FuncIndexIter<'_> { + type Item = FuncTabIndexEntry; + type Error = Error; + + fn next(&mut self) -> Result> { + if self.0.is_empty() { + return Ok(None); + } + + FuncTabIndexEntry::read(&mut self.0).map(Some) + } +} + +/// Decoder for the `go:func.*` region. +#[derive(Debug)] +pub enum FuncData<'obj> { + /// `go:func.*` references are absolute pointers (Go < 1.18). + Global(Header, objfile::MemoryMap<'obj>), + + /// `go:func.*` references are relative to `gofunc` field in module data (Go >= 1.18). 
+ GoFunc(Reader<'obj>), +} + +impl<'obj> FuncData<'obj> { + fn mk_reader(&self, fdref: FuncDataRef) -> Result> { + match (fdref, self) { + (FuncDataRef::Addr(abs), FuncData::Global(header, mem)) => { + let sec = mem.section_for_addr(abs).ok_or(Error::InvalidPtr)?; + let slice = sec.as_obj_slice().ok_or(Error::CannotAvoidCopy)?; + let sub = &slice[abs as usize - sec.virt_addr() as usize..]; + Ok(Reader::new(*header, abs, sub)) + } + (FuncDataRef::Offs(offs), FuncData::GoFunc(gofunc)) => { + Ok(gofunc.sub_reader(offs.0 as usize..)?) + } + _ => unreachable!("bug: invalid addr/offs global/gofunc combination"), + } + } + + pub fn inlined_call(&self, tree: FuncDataRef, idx: InlineTreeIndex) -> Result { + let mut reader = self.mk_reader(tree)?; + let sz = InlinedCall::size_of(reader.header()); + reader.skip(idx.0 as usize * sz); + InlinedCall::read(reader) + } +} diff --git a/rust-crates/symblib/src/gosym/raw/structs.rs b/rust-crates/symblib/src/gosym/raw/structs.rs new file mode 100644 index 00000000..035f621e --- /dev/null +++ b/rust-crates/symblib/src/gosym/raw/structs.rs @@ -0,0 +1,411 @@ +// Copyright The OpenTelemetry Authors +// SPDX-License-Identifier: Apache-2.0 + +//! Decoding for Go's runtime data structures. +// +//! Types in this module are all concerned with each decoding a single data +//! structure. If one data structure contains references into another, this +//! is represented as an offset. Chasing offsets and following references +//! is left to the main module. + +use super::*; +use fallible_iterator::FallibleIterator; +use std::ops::Range; + +/// Minimal subset of the gopclntab header (`runtime.pcData`). +/// +/// This contains all the fields that [`Reader`] needs to know how to read Go +/// specific types but not any more. The intention is to keep this type small +/// enough that we can pass it around in a register. 
+#[derive(Debug, Clone, Copy)] +pub struct Header { + pub version: Version, + pub quantum: u8, + pub ptr_size: u8, +} + +impl Header { + pub fn read(sec: &[u8]) -> Result { + // Check header: 4-byte magic, two zeros, pc quantum, pointer size. + if sec.len() < 16 || sec[4] != 0 || sec[5] != 0 { + return Err(Error::MalformedGopclntab); + } + + let version = Version::from_magic(sec[..4].try_into().unwrap())?; + + // quantum and ptrSize are the same between 1.2, 1.16, and 1.18 + let quantum = sec[6]; + if !matches!(quantum, 1 | 2 | 4) { + return Err(Error::MalformedGopclntab); + } + + let ptr_size = sec[7]; + if !matches!(ptr_size, 4 | 8) { + return Err(Error::MalformedGopclntab); + } + + Ok(Header { + version, + quantum, + ptr_size, + }) + } +} + +/// Rest of the `.gopclntab` header (`runtime.pcData`). +/// +/// Excluding the portion that we already have via [`Header`]. +#[derive(Debug)] +pub struct HeaderOffsets { + pub num_funcs: u64, + #[allow(dead_code)] + pub num_files: u64, + pub text_start: Option, + pub funcname_offset: GopclntabOffset, + pub cutab_offset: GopclntabOffset, + pub filetab_offset: GopclntabOffset, + pub pctab_offset: GopclntabOffset, + pub pcln_offset: GopclntabOffset, +} + +impl HeaderOffsets { + pub fn new(mut r: Reader<'_>) -> Result { + Ok(Self { + num_funcs: r.skip(8).uintptr()?, + num_files: r.uintptr()?, + text_start: if r.version() >= Version::V118 { + Some(r.uintptr()?) + } else { + None + }, + funcname_offset: GopclntabOffset(r.uintptr()?), + cutab_offset: GopclntabOffset(r.uintptr()?), + filetab_offset: GopclntabOffset(r.uintptr()?), + pctab_offset: GopclntabOffset(r.uintptr()?), + pcln_offset: GopclntabOffset(r.uintptr()?), + }) + } +} + +/// Decoder for the `runtime.functab` structure. 
+/// +/// +/// +#[derive(Debug)] +pub struct FuncTabIndexEntry { + #[allow(dead_code)] + pub entry: CodePtr, + pub funcoff: FuncTabOffset, +} + +impl FuncTabIndexEntry { + pub fn size_of(h: Header) -> usize { + if h.version >= Version::V118 { + 2 * 4 + } else { + h.ptr_size as usize * 2 + } + } + + pub fn read(r: &mut Reader<'_>) -> Result { + Ok(Self { + entry: r.code_ptr()?, + funcoff: FuncTabOffset(if r.version() >= Version::V118 { + r.u32()? as u64 + } else { + r.uintptr()? + }), + }) + } +} + +/// Index in the dynamic PC data array in [`Func`]. +/// +/// +#[allow(dead_code)] +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum PcDataField { + InlTreeIndex = 2, +} + +/// Index in the dynamic func data array in [`Func`]. +/// +/// +#[allow(dead_code)] +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum FuncDataField { + InlTree = 3, +} + +/// Decoder for the `runtime._func` structure. +/// +/// +/// +/// +#[derive(Debug)] +pub struct Func<'obj> { + pub func_pc: CodePtr, + pub name: FuncNameOffset, + pub pcfile: PcTabOffset, + pub pcln: PcTabOffset, + pub cu_offset: CuTabIndex, + pub start_line: Option, + pc_data: Reader<'obj>, + func_data: Reader<'obj>, +} + +impl<'obj> Func<'obj> { + pub fn read(mut r: Reader<'obj>) -> Result { + use Version as V; + + let func_pc = r.code_ptr()?; + let name = FuncNameOffset(r.u32()?); + let pcfile = PcTabOffset(r.skip(4 * 3).u32()?); + let pcln = PcTabOffset(r.u32()?); + let npcdata = r.u32()? as usize; + let cu_offset = CuTabIndex(r.u32()?); + + let start_line = if r.version() >= V::V120 { + Some(r.u32()?) + } else { + None + }; + + r.skip(3); // flags + func ID + pad byte + + let nfuncdata = r.u8()? as usize; + + let pc_data = r.sub_reader(..4 * npcdata)?; + r.skip(4 * npcdata); + + let func_data_offs_sz = if r.version() >= V::V118 { + 4 + } else { + // For older Go versions we need to account for pointer alignment. 
+ r.align_up(); + r.ptr_size() + }; + + Ok(Func { + func_pc, + name, + pcfile, + pcln, + cu_offset, + pc_data, + start_line, + func_data: r.sub_reader(..func_data_offs_sz * nfuncdata)?, + }) + } + + pub fn pc_data(&self, n: PcDataField) -> Option { + let mut r = self.pc_data.sub_reader(4 * n as usize..).ok()?; + r.u32().ok().filter(|x| *x != 0).map(PcTabOffset) + } + + pub fn func_data(&self, n: FuncDataField) -> Option { + if self.func_data.version() >= Version::V118 { + let mut r = self.func_data.sub_reader(4 * n as usize..).ok()?; + let offs = GoFuncOffset(r.u32().ok()?); + if offs != GoFuncOffset::INVALID { + Some(FuncDataRef::Offs(offs)) + } else { + None + } + } else { + let psz = self.func_data.ptr_size(); + let mut r = self.func_data.sub_reader(psz * n as usize..).ok()?; + let addr = r.uintptr().ok()?; + if addr != 0 { + Some(FuncDataRef::Addr(addr)) + } else { + None + } + } + } +} + +/// Decoder for `runtime.inlinedCall` +/// +/// +/// +#[derive(Debug)] +pub struct InlinedCall { + /// Marker for special runtime functions. + #[allow(dead_code)] + pub func_id: FuncId, + + /// Position of an instruction whose source position is the call site (offset from entry) + pub parent_pc: u64, + + /// Offset into `runtime.funcname` for named of called function. + /// + /// The comment in the Go source says it's relative to `pclntab`, + /// but that's clearly incorrect. + pub name_offset: FuncNameOffset, + + /// Version specific data. + pub info: InlinedCallInfo, +} + +impl InlinedCall { + pub fn size_of(h: Header) -> usize { + if h.version >= Version::V120 { + 16 + } else { + 20 + } + } + + fn read_new(mut r: Reader<'_>) -> Result { + let func_id = FuncId(r.u8()?); + let name_offset = FuncNameOffset(r.skip(3 /* pad */).u32()?); + let parent_pc = r.u32()? 
as u64; + let start_line = r.u32()?; + + Ok(InlinedCall { + func_id, + parent_pc, + name_offset, + info: InlinedCallInfo::New { start_line }, + }) + } + + fn read_old(mut r: Reader<'_>) -> Result { + let parent_idx = r.i16()?; + let func_id = FuncId(r.u8()?); + let file = CuTabIndex(r.skip(1 /* pad */).u32()?); // TODO: i32? + let line = r.u32()?; + let name_offset = FuncNameOffset(r.u32()?); + let parent_pc = r.u32()? as u64; + + Ok(InlinedCall { + func_id, + parent_pc, + name_offset, + info: InlinedCallInfo::Old { + parent_idx, + file, + line, + }, + }) + } + + pub fn read(r: Reader<'_>) -> Result { + if r.version() >= Version::V120 { + Self::read_new(r) + } else { + Self::read_old(r) + } + } +} + +/// Version specific portion of [`InlinedCall`]. +#[derive(Debug)] +pub enum InlinedCallInfo { + New { + /// Line number of start of function (func keyword/TEXT directive). + start_line: u32, + }, + Old { + /// Index of parent in the inline tree, or < 0. + #[allow(dead_code)] + parent_idx: i16, + /// Per-CU file index for inlined call. + #[allow(dead_code)] + file: CuTabIndex, + /// Line number of the call site. + #[allow(dead_code)] + line: u32, + }, +} + +/// Decoder for data from `runtime.moduledata`. +/// +/// +/// +#[derive(Debug)] +pub struct ModuleData { + /// Address of the function name table. + pub funcnametab: VirtAddr, + /// Address of the CU table. + pub cutab: VirtAddr, + /// Start of the `go:func.*` region. + pub go_func: VirtAddr, +} + +impl ModuleData { + /// Read module data from the given reader. + pub fn read(r: Reader<'_>) -> Result { + // offsetof(..) 
for funcnametab.ptr, cutab.ptr and gofunc fields + let (funcnametab, cutab, go_func); + + match r.version() { + Version::V118 => { + funcnametab = 1 * r.ptr_size(); + cutab = 4 * r.ptr_size(); + go_func = 38 * r.ptr_size(); + } + Version::V120 => { + funcnametab = 1 * r.ptr_size(); + cutab = 4 * r.ptr_size(); + go_func = 40 * r.ptr_size(); + } + _ => return Err(Error::UnsupportedGoVersion), + } + + Ok(ModuleData { + funcnametab: r.sub_reader(funcnametab..)?.uintptr()?, + cutab: r.sub_reader(cutab..)?.uintptr()?, + go_func: r.sub_reader(go_func..)?.uintptr()?, + }) + } +} + +/// Decoder for `pcdata` sequences within [`PcTable`]. +/// +/// +#[derive(Debug)] +pub struct PcDataReader<'obj> { + reader: Option>, + pc_offset: TextStartOffset, + value: i32, + first: bool, +} + +impl<'obj> PcDataReader<'obj> { + pub fn new(r: Reader<'obj>) -> Self { + PcDataReader { + pc_offset: TextStartOffset(0), + value: -1, + first: true, + reader: Some(r), + } + } +} + +impl<'obj> FallibleIterator for PcDataReader<'obj> { + type Item = (Range, i32); + type Error = Error; + + fn next(&mut self) -> Result> { + let Some(reader) = self.reader.as_mut() else { + return Ok(None); + }; + + let uv_delta = reader.var_i32()?; + if uv_delta == 0 && !self.first { + self.reader = None; + return Ok(None); + } + self.value = self.value.wrapping_add(uv_delta); + + let pc_delta = reader.var_u32()? as u64; + let pc_delta_scaled = pc_delta.wrapping_mul(reader.quantum() as u64); + let prev_pc_offs = self.pc_offset; + self.pc_offset.0 = prev_pc_offs.0.wrapping_add(pc_delta_scaled); + + self.first = false; + Ok(Some((prev_pc_offs..self.pc_offset, self.value))) + } +} diff --git a/rust-crates/symblib/src/gosym/raw/types.rs b/rust-crates/symblib/src/gosym/raw/types.rs new file mode 100644 index 00000000..82492d30 --- /dev/null +++ b/rust-crates/symblib/src/gosym/raw/types.rs @@ -0,0 +1,81 @@ +// Copyright The OpenTelemetry Authors +// SPDX-License-Identifier: Apache-2.0 + +//! 
New-types for offsets and other integer types. +//! +//! In Go's runtime there are many different offset types that each need to be +//! added to a particular base address to calculate the final pointer. We have +//! a separate offset type for each such base address. + +use crate::VirtAddr; + +/// Offset within the `.gopclntab` section. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub struct GopclntabOffset(pub u64); + +/// Offset within `runtime.funcnametab`. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub struct FuncNameOffset(pub u32); + +/// Offset within `runtime.filetab`. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub struct FileNameOffset(pub u32); + +impl FileNameOffset { + pub const INVALID: Self = FileNameOffset(u32::MAX); +} + +/// Offset within `runtime.functab`. +/// +/// `u32` in versions >=1.18, `u64` in older ones. We simply widen it to +/// `u64` for all versions. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub struct FuncTabOffset(pub u64); + +/// Offset within `go:func.*`. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub struct GoFuncOffset(pub u32); + +impl GoFuncOffset { + pub const INVALID: Self = GoFuncOffset(u32::MAX); +} + +/// Function data reference, either relative or absolute. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum FuncDataRef { + Addr(VirtAddr), + Offs(GoFuncOffset), +} + +/// Offset within `runtime.pctab`. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub struct PcTabOffset(pub u32); + +/// Index within `runtime.cutab`. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub struct CuTabIndex(pub u32); + +/// Virtual address offset relative to `text_start`. +/// +/// This is usually stored as `u32`, but some code paths accumulate deltas +/// that in sum can then become larger than `u32`. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub struct TextStartOffset(pub u64); + +/// Pointer to code, either relative or absolute. 
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum CodePtr {
+    /// Absolute virtual address (Go versions < 1.18).
+    Addr(VirtAddr),
+
+    /// Offset relative to text start (Go versions >= 1.18).
+    Offs(TextStartOffset),
+}
+
+/// Identifier for special internal Go functions.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub struct FuncId(pub u8);
+
+/// Index within an inline tree.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub struct InlineTreeIndex(pub u32);
diff --git a/rust-crates/symblib/src/lib.rs b/rust-crates/symblib/src/lib.rs
new file mode 100644
index 00000000..c4913693
--- /dev/null
+++ b/rust-crates/symblib/src/lib.rs
@@ -0,0 +1,67 @@
+// Copyright The OpenTelemetry Authors
+// SPDX-License-Identifier: Apache-2.0
+
+#![doc = include_str!("../README.md")]
+#![warn(missing_docs)]
+
+pub mod covmap;
+pub mod dbglog;
+pub mod demangle;
+pub mod disas;
+pub mod dwarf;
+pub mod fileid;
+pub mod gosym;
+pub mod objfile;
+pub mod retpads;
+pub mod symbconv;
+pub mod symbfile;
+
+/// Type-erased error type.
+///
+/// We primarily use this to hand out errors from third-party libraries where
+/// lifting them into distinct error variants didn't make sense because no
+/// consumer cares about differentiating between different error variants.
+pub type AnyError = Box;
+
+/// Virtual address in the ELF / mach-O address space.
+pub type VirtAddr = u64;
+
+/// Returns the overlap of two given ranges, or `None` if no overlap.
+///
+/// # Examples
+///
+/// ```
+/// # use symblib::range_overlap;
+/// assert_eq!(range_overlap(&(0..5), &(1..3)), Some(1..3));
+/// assert_eq!(range_overlap(&(0..5), &(5..10)), None);
+/// assert_eq!(range_overlap(&(0..5), &(4..10)), Some(4..5));
+/// assert_eq!(range_overlap(&(4..10), &(0..5)), Some(4..5)); // order is irrelevant
+/// assert_eq!(range_overlap(&(0..0), &(0..1)), None); // empty ranges can't overlap anything!
+/// ``` +pub fn range_overlap( + a: &std::ops::Range, + b: &std::ops::Range, +) -> Option> { + let c = std::ops::Range { + start: a.start.max(b.start), + end: a.end.min(b.end), + }; + + if c.is_empty() { + None + } else { + Some(c) + } +} + +#[cfg(test)] +mod tests { + use std::path::PathBuf; + + /// Construct path for test files living in `./testdata`. + pub fn testdata(name: &str) -> PathBuf { + PathBuf::from(env!("CARGO_MANIFEST_DIR")) + .join("testdata") + .join(name) + } +} diff --git a/rust-crates/symblib/src/objfile.rs b/rust-crates/symblib/src/objfile.rs new file mode 100644 index 00000000..e3a71863 --- /dev/null +++ b/rust-crates/symblib/src/objfile.rs @@ -0,0 +1,771 @@ +// Copyright The OpenTelemetry Authors +// SPDX-License-Identifier: Apache-2.0 + +//! High-level abstractions for working with large object files. + +use crate::{AnyError, VirtAddr}; + +use std::io::Read as _; +use std::{fmt, fs, io, ops, path}; + +use flate2::read::ZlibDecoder; +use memmap2::{Mmap, MmapMut}; +use object::{ + CompressionFormat, Object as _, ObjectSection as _, ObjectSegment as _, ObjectSymbol as _, +}; +use zstd::stream::read::Decoder as ZstdDecoder; + +/// Length of a GNU build ID. +const BUILD_ID_LEN: usize = 20; + +/// Maximum size of a GNU debug link. +const MAX_DEBUG_LINK_LENGTH: usize = 4096; + +/// Maximum size of an individual object section to keep in memory. +/// +/// All sections where the decompressed representation is larger than this +/// constant are instead read into anonymous temporary files and `mmap`ed. +const SWAP_THRESH: usize = 16 * 1024 * 1024; + +/// Result type shorthand. +pub type Result = std::result::Result; + +/// Errors that can occur during object file parsing. 
+#[non_exhaustive] +#[allow(missing_docs)] +#[derive(Debug, thiserror::Error)] +pub enum Error { + #[error("GNU alt link section is malformed")] + MalformedGnuAltLink, + + #[error("Sections are compressed in an unsupported format")] + UnsupportedCompressionFormat, + + #[error("Section uses an unsupported relocation encoding")] + UnsupportedRelocEncoding, + + #[error("Section uses an unsupported relocation kind")] + UnsupportedRelocKind, + + #[error("Section uses an unsupported relocation target")] + UnsupportedRelocTarget, + + #[error("Section uses an unsupported relocation size")] + UnsupportedRelocSize, + + #[error("Relocation offset is out of bounds for the section")] + OutOfBoundsRelocOffset, + + #[error("Relocation contains an invalid symbol index")] + BadSymbolIndex, + + #[error("Relocation contain an invalid section index")] + BadSectionIndex, + + #[error("Object file is too big to be loaded")] + FileTooBig, + + #[error("IO error")] + IO(#[from] io::Error), + + #[error(transparent)] + Other(AnyError), +} + +/// Conversion of [`object`] errors into ours, with type erasure. +/// +/// We erase the type here to prevent leaking [`object`] library types into our +/// public interface. If code needs to special-case based on particular [`object`] +/// errors, we should instead lift them into custom error variants. +impl From for Error { + fn from(e: object::Error) -> Self { + Self::Other(Box::new(e)) + } +} + +/// Maps an object file or executable into memory. +/// +/// This currently supports ELF and mach-O files. The backing file is `mmap`ed +/// to make reading more efficient. This currently uses the [`object`] library +/// to perform the actual heavy lifting, however this should be considered an +/// implementation detail. +pub struct File(Mmap); + +impl File { + /// Map the file at the given path into memory. + pub fn load(path: &path::Path) -> Result { + Self::load_file(&fs::File::open(path)?) + } + + /// Map the given file into memory. 
+ pub fn load_file(file: &fs::File) -> Result { + Ok(Self(unsafe { Mmap::map(file)? })) + } + + /// Parse the header and create a reader. + pub fn parse(&self) -> Result { + Ok(Reader(object::File::parse(&self.0[..])?)) + } +} + +/// Provides read access to the data in an object file. +/// +/// Created via [`File::parse`]. +pub struct Reader<'obj>(object::File<'obj>); + +impl<'obj> Reader<'obj> { + /// Loads the section with the given name into memory. + /// + /// Depending on whether the section is compressed in the input file or not, + /// this can be an expensive operation. Callers should store and retrieve + /// the returned instance if it is needed more than once. + pub fn load_section(&self, name: &[u8]) -> Result>> { + let Some(obj_sec) = self.0.section_by_name_bytes(name) else { + return Ok(None); + }; + + Section::load_from_obj_section(&obj_sec).map(Some) + } + + /// Like `[Self::load_section]`, but applies relocations if necessary. + /// + /// This currently only supports some basic relocation types that we have + /// seen being applied to DWARF sections in the wild. + pub fn load_section_reloc(&self, name: &[u8]) -> Result>> { + let Some(obj_sec) = self.0.section_by_name_bytes(name) else { + return Ok(None); + }; + + let mut section = Section::load_from_obj_section(&obj_sec)?; + + // Don't apply relocations for executables. For ELF files, this + // corresponds to `ET_EXEC`. We have previously learned the hard + // way [1] that non-relocatable executables will sometimes come + // with relocations that, when applied, will essentially relocate + // the executable twice. + // + // [1]: https://go-review.googlesource.com/c/go/+/327009 + if self.0.kind() == object::ObjectKind::Executable { + return Ok(Some(section)); + } + + // If there are no relocations for this section, we are done here. + if obj_sec.relocations().next().is_none() { + return Ok(Some(section)); + } + + // Make section data mutable so we can apply relocations. 
+ let section_data = section.data.make_mut()?; + + // Apply relocations. + for (offset, reloc) in obj_sec.relocations() { + if reloc.encoding() != object::RelocationEncoding::Generic { + return Err(Error::UnsupportedRelocEncoding); + } + + // `a` corresponds to `A` in `RelocationKind` documentation. + let a = reloc.addend(); + + // `p` corresponds to `P` in `RelocationKind` documentation. + let p = match reloc.kind() { + object::RelocationKind::Absolute => 0, + object::RelocationKind::Relative => section.virt_addr.wrapping_add(offset), + _ => return Err(Error::UnsupportedRelocKind), + }; + + // `s` corresponds to `S` in `RelocationKind` documentation. + let s = match reloc.target() { + object::RelocationTarget::Absolute => 0, + + object::RelocationTarget::Symbol(sym_idx) => { + let Ok(refd_sym) = self.0.symbol_by_index(sym_idx) else { + return Err(Error::BadSymbolIndex); + }; + + refd_sym.address() + } + + object::RelocationTarget::Section(sec_idx) => { + let Ok(refd_sec) = self.0.section_by_index(sec_idx) else { + return Err(Error::BadSectionIndex); + }; + + refd_sec.address() + } + + _ => return Err(Error::UnsupportedRelocTarget), + }; + + // Calculate relocation byte size via ceil division. + let reloc_byte_size = (usize::from(reloc.size()) + 7) / 8; + + let Ok(offset) = usize::try_from(offset) else { + return Err(Error::OutOfBoundsRelocOffset); + }; + + if section_data.len().saturating_sub(offset) < reloc_byte_size { + return Err(Error::OutOfBoundsRelocOffset); + } + + // Create slice for the data to be updated with the relocation. + let reloc_buf = &mut section_data[offset..offset + reloc_byte_size]; + + // The implicit addend is the original value at the location that + // we are relocating. In ELF, this is decided from the section name + // (`rela` => no implicit addend, `rel` => use implicit addend). 
+ let implicit_addend = match (reloc.has_implicit_addend(), reloc.size()) { + (true, 32) => u32::from_le_bytes(reloc_buf.try_into().unwrap()) as u64, + (true, 64) => u64::from_le_bytes(reloc_buf.try_into().unwrap()), + (true, _) => return Err(Error::UnsupportedRelocSize), + (false, _) => 0, + }; + + let relocated = implicit_addend + .wrapping_add(s) + .wrapping_add_signed(a) + .wrapping_sub(p); + + match reloc.size() { + 32 => reloc_buf.copy_from_slice(&(relocated as u32).to_le_bytes()), + 64 => reloc_buf.copy_from_slice(&relocated.to_le_bytes()), + _ => return Err(Error::UnsupportedRelocSize), + } + } + + Ok(Some(section)) + } + + /// Checks whether this file has little-endian byte-order. + pub fn is_little_endian(&self) -> bool { + self.0.is_little_endian() + } + + /// Returns the architecture, or [`None`] if unknown. + pub fn arch(&self) -> Option { + match self.0.architecture() { + object::Architecture::Aarch64 => Some(Arch::Aarch64), + object::Architecture::X86_64 => Some(Arch::X86_64), + _ => None, + } + } + + /// Read the contents of the `.gnu_debugaltlink` section. + pub fn gnu_debug_alt_link(&self) -> Result> { + GnuDebugAltLink::load_from_obj(self) + } + + /// Creates a map of all memory mapped regions of the object file. + pub fn memory_map<'reader>(&'reader self) -> Result> { + // For ELF files, `.segments()` iterates over PT_LOAD program headers. + // Load segments cannot be compressed, so we can always borrow them. + let mut regions = Vec::new(); + for segment in self.0.segments() { + regions.push(Section { + prot: Protection::from_segment_flags(segment.flags()), + virt_addr: segment.address(), + virt_size: segment.size(), + data: SectionData::Borrowed(segment.data()?), + }); + } + + regions.sort_unstable_by_key(|x| x.virt_addr); + + Ok(MemoryMap(regions)) + } + + /// Find a symbol by name. + /// + /// Dynamic symbols are preferred over debug symbols. This currently does + /// a linear search over all symbols. 
+ pub fn resolve_symbol(&self, name: &str) -> Option> { + self.0 + .dynamic_symbols() + .chain(self.0.symbols()) + .find(|sym| sym.name().map_or(false, |x| x == name)) + .map(|sym| Symbol { + name: sym.name().expect("validated in `find` step"), + virt_addr: sym.address(), + length: sym.size(), + }) + } + + /// Iterate over function symbols in this executable. + pub fn function_symbols(&self, source: SymbolSource) -> impl Iterator> { + let iter = match source { + SymbolSource::Debug => self.0.symbols(), + SymbolSource::Dynamic => self.0.dynamic_symbols(), + }; + + iter.filter(|x| x.kind() == object::SymbolKind::Text) + // Dynamic symbols with addr = 0 are imports. Also, compilers + // often generate bogus debug symbol records at 0. + .filter(|x| x.address() != 0) + .filter(|x| x.size() != 0) + .filter_map(|x| { + Some(Symbol { + name: x.name().ok()?, // just skip non-utf8 symbols + virt_addr: x.address(), + length: x.size(), + }) + }) + } +} + +/// Information and raw data of an object file section. +/// +/// This doesn't exactly correspond to an ELF section: we also use it to +/// represent memory regions described in program headers. +#[derive(Debug)] +pub struct Section<'obj> { + virt_addr: VirtAddr, + virt_size: u64, + prot: Option, + data: SectionData<'obj>, +} + +impl<'obj> Section<'obj> { + /// Construction from an [`object::Section`]. + fn load_from_obj_section(obj_sec: &object::Section<'obj, '_>) -> Result { + Ok(Section { + virt_addr: obj_sec.address(), + virt_size: obj_sec.size(), + prot: None, + data: SectionData::load_from_obj_sec(obj_sec)?, + }) + } + + /// Returns the virtual address range of the section. + pub fn va_range(&self) -> ops::Range { + self.virt_addr..self.virt_addr + self.virt_size + } + + /// Returns the virtual address of the first byte of this section. + pub fn virt_addr(&self) -> VirtAddr { + self.virt_addr + } + + /// Returns the virtual size of the section. 
+ /// + /// Can be larger than the actual data, padding must be assumed to be zeroed. + pub fn virt_size(&self) -> u64 { + self.virt_size + } + + /// Returns the protection flags for this memory region. + /// + /// This is only available for sections from [`MemoryMap::iter`]. + pub fn protection(&self) -> Option { + self.prot + } + + /// Tries borrowing the section data as a slice with `'obj` lifetime. + /// + /// This only works for sections where the data is not owned by the + /// section thus has the larger `'obj` lifetime (instead of "`'self`"). + pub fn as_obj_slice(&self) -> Option<&'obj [u8]> { + if let SectionData::Borrowed(slice) = self.data { + Some(slice) + } else { + None + } + } +} + +/// Allow using section objects where slices are expected. +impl<'obj> ops::Deref for Section<'obj> { + type Target = [u8]; + + fn deref(&self) -> &Self::Target { + match &self.data { + SectionData::Borrowed(x) => x, + SectionData::InMemory(x) => &x[..], + SectionData::Swapped(x) => &x[..], + } + } +} + +/// Storage for object file sections. +pub enum SectionData<'obj> { + /// Section was uncompressed in the input file and we simply kept a ref. + Borrowed(&'obj [u8]), + + /// Section was originally compressed and we decompressed it into memory. + InMemory(Vec), + + /// Section was originally compressed and we decompressed it into a + /// memory-mapped temporary file. + Swapped(MmapMut), +} + +impl<'obj> SectionData<'obj> { + /// Create [`Self::InMemory`] variant from a reader. + fn read_into_memory(final_size: usize, mut reader: impl io::Read) -> Result { + let mut mem_buf = Vec::with_capacity(final_size); + reader.read_to_end(&mut mem_buf)?; + Ok(SectionData::InMemory(mem_buf)) + } + + /// Create [`Self::Swapped`] variant from a reader. + fn read_into_swap(mut reader: impl io::Read) -> Result { + let mut file = tempfile::tempfile()?; + io::copy(&mut reader, &mut file)?; + let mmap = unsafe { MmapMut::map_mut(&file)? 
}; + Ok(SectionData::Swapped(mmap)) + } + + /// Creates a variant of the [`SectionData`] enum most appropriate for the + /// given size. + /// + /// Uncompressed sections are handed out as a reference whereas compressed + /// ones are either decoded into memory or into `mmap`ed temporary files + /// based on their size. + fn read_smart(final_size: usize, reader: impl io::Read) -> Result { + if final_size >= SWAP_THRESH { + Self::read_into_swap(reader) + } else { + Self::read_into_memory(final_size, reader) + } + } + + /// Load the data from the given [`object::Section`]. + fn load_from_obj_sec(sec: &object::Section<'obj, '_>) -> Result { + let data = sec.compressed_data()?; + + // Ensure that the file fits into memory. + let final_size: usize = data + .uncompressed_size + .try_into() + .map_err(|_| Error::FileTooBig)?; + + let decoder: Box = match data.format { + CompressionFormat::Zlib => Box::new(ZlibDecoder::new(data.data)), + CompressionFormat::Zstandard => Box::new(ZstdDecoder::new(data.data)?), + CompressionFormat::None => return Ok(SectionData::Borrowed(data.data)), + _ => return Err(Error::UnsupportedCompressionFormat), + }; + + // Still here? Compressed section: unpack it. + let decoder = decoder.take(final_size as u64); + Self::read_smart(final_size, decoder) + } + + /// Builds a mutable reference to the section's data (CoW semantics). + /// + /// If the data was previously borrowed, the first call will force a copy; + /// all consecutive calls will re-use the same buffer. + pub fn make_mut(&mut self) -> Result<&mut [u8]> { + let borrowed = match self { + // Fast paths: underlying buffer is writable already. + SectionData::InMemory(x) => return Ok(&mut x[..]), + SectionData::Swapped(x) => return Ok(&mut x[..]), + + // Expensive case: we need to copy. 
+ SectionData::Borrowed(x) => x, + }; + + *self = Self::read_smart(borrowed.len(), borrowed)?; + + self.make_mut() + } +} + +impl<'obj> fmt::Debug for SectionData<'obj> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + let (storage, len) = match self { + Self::Borrowed(x) => ("borrowed", x.len()), + Self::InMemory(x) => ("in-memory", x.len()), + Self::Swapped(x) => ("mmapped", x.len()), + }; + + write!(f, "SectionData([{} bytes, {}])", len, storage) + } +} + +/// Represents a GNU build ID. +#[repr(transparent)] +#[derive(Copy, Clone, PartialEq, Eq, Hash)] +pub struct GnuBuildId(pub [u8; BUILD_ID_LEN]); + +impl fmt::Debug for GnuBuildId { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + let hex: String = self.0.iter().map(|x| format!("{x:02X}")).collect(); + f.debug_tuple("GnuBuildId").field(&hex).finish() + } +} + +/// Parsed contents of the `.gnu_debugaltlink` section. +#[derive(Debug, Clone)] +pub struct GnuDebugAltLink { + /// Relative or absolute path to the supplementary debug file. + /// + /// May contain non UTF-8 characters, hence represented as raw bytes. + pub path: Vec, + + /// GNU build ID for the supplementary debug file. + pub build_id: GnuBuildId, +} + +impl GnuDebugAltLink { + fn load_from_obj(obj: &Reader<'_>) -> Result> { + let Some(sec) = obj.load_section(b".gnu_debugaltlink")? else { + return Ok(None); + }; + + let Some(end_of_path) = sec.iter().position(|&x| x == 0) else { + return Err(Error::MalformedGnuAltLink); + }; + if end_of_path > MAX_DEBUG_LINK_LENGTH { + return Err(Error::MalformedGnuAltLink); + } + + let path = sec[..end_of_path].to_owned(); + + let build_id = GnuBuildId( + sec[end_of_path + 1..] + .try_into() + .map_err(|_| Error::MalformedGnuAltLink)?, + ); + + Ok(Some(GnuDebugAltLink { build_id, path })) + } +} + +/// Provides quick lookups from virtual addresses to the corresponding object file region. 
+#[derive(Debug)] +pub struct MemoryMap<'obj>(Vec<Section<'obj>>); + +impl<'obj> MemoryMap<'obj> { + /// Finds the section for the given virtual address. + pub fn section_for_addr(&self, addr: VirtAddr) -> Option<&Section<'obj>> { + let idx = match self.0.binary_search_by_key(&addr, |x| x.virt_addr) { + Ok(idx) => idx, + Err(idx) => idx.checked_sub(1)?, + }; + + let region = self.0.get(idx)?; + + if region.virt_size > addr - region.virt_addr { + Some(region) + } else { + None + } + } + + /// Returns a slice for the data at the given address. + /// + /// The returned slice might be shorter than the requested length if the + /// section's virtual size is larger than the data backing it up. In these + /// cases the caller can assume that the remaining bytes are zero. + pub fn slice_for_addr(&self, addr: VirtAddr, length: u64) -> Option<&[u8]> { + let section = self.section_for_addr(addr)?; + let offset = addr - section.virt_addr(); + + if offset.checked_add(length)? > section.virt_size() { + // Outside of virtual section range: indicate via `None`. + return None; + } + + let start = offset as usize; + let end = (start + length as usize).min(section.len()); + + if start >= end { + // Within virtual section range, but no actual data present: + // indicate via empty slice. + return Some(&[]); + } + + Some(&section[start..end]) + } + + /// Iterate over all memory regions. + pub fn iter(&self) -> std::slice::Iter<Section<'obj>> { + self.0.iter() + } +} + +/// Allows iterating the memory map via `&my_memory_map`. +impl<'map, 'obj> IntoIterator for &'map MemoryMap<'obj> { + type Item = &'map Section<'obj>; + type IntoIter = std::slice::Iter<'map, Section<'obj>>; + + fn into_iter(self) -> Self::IntoIter { + self.iter() + } +} + +/// CPU architecture. +#[non_exhaustive] +#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)] +pub enum Arch { + /// `aarch64` aka `arm64`. + Aarch64, + /// `x86_64` aka `amd64`. + X86_64, +} + +impl Arch { + /// Minimum instruction alignment required by architecture. 
+ pub const fn min_code_align(self) -> u64 { + match self { + Arch::Aarch64 => 4, + Arch::X86_64 => 1, + } + } +} + +/// Specifies an object symbol source. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +pub enum SymbolSource { + /// Debug symbol table (`.symtab`). + Debug, + + /// Dynamic symbol table (`.dynsym`). + Dynamic, +} + +/// Memory access protection flags. +#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)] +pub struct Protection { + /// Read permissions. + pub r: bool, + /// Write permissions. + pub w: bool, + /// Execute permissions. + pub x: bool, +} + +impl Protection { + fn from_segment_flags(flags: object::SegmentFlags) -> Option { + match flags { + object::SegmentFlags::Elf { p_flags, .. } => Some(Self { + r: p_flags & object::elf::PF_R != 0, + w: p_flags & object::elf::PF_W != 0, + x: p_flags & object::elf::PF_X != 0, + }), + object::SegmentFlags::MachO { maxprot, .. } => Some(Self { + r: maxprot & object::macho::VM_PROT_READ != 0, + w: maxprot & object::macho::VM_PROT_WRITE != 0, + x: maxprot & object::macho::VM_PROT_EXECUTE != 0, + }), + _ => None, + } + } +} + +/// Basic executable function symbol. +#[derive(Debug, Clone)] +pub struct Symbol<'a> { + /// Function name. Might be mangled. + pub name: &'a str, + /// Start address of the function. + pub virt_addr: VirtAddr, + /// Length of the function. + pub length: u64, +} + +impl Symbol<'_> { + /// Constructs the address range for the symbol. 
+ pub fn range(&self) -> ops::Range<VirtAddr> { + self.virt_addr..self.virt_addr.saturating_add(self.length) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::tests::testdata; + + #[test] + fn arch() { + let obj = File::load(&testdata("inline")).unwrap(); + let reader = obj.parse().unwrap(); + assert_eq!(reader.arch(), Some(Arch::Aarch64)); + } + + #[test] + fn uncompressed_section() { + let obj = File::load(&testdata("inline")).unwrap(); + let reader = obj.parse().unwrap(); + let section = reader.load_section(b".debug_info").unwrap().unwrap(); + assert!(matches!(section.data, SectionData::Borrowed(_))); + assert_eq!(section.virt_addr(), 0); + assert_eq!(section.len(), 0x22c); + + let section = reader.load_section(b".text").unwrap().unwrap(); + assert_eq!( + &section[..8], + [0xFD, 0x7B, 0xBF, 0xA9, 0xFD, 0x03, 0x00, 0x91] + ); + } + + #[test] + fn compressed_section() { + for file in ["inline-compressed-dwarf", "inline-compressed-dwarf-zstd"] { + let obj = File::load(&testdata(file)).unwrap(); + let reader = obj.parse().unwrap(); + let section = reader.load_section(b".debug_info").unwrap().unwrap(); + assert!(matches!(section.data, SectionData::InMemory(_))); + assert_eq!(section.virt_addr(), 0); + assert_eq!(section.len(), 0x22c); + } + + { + let obj = File::load(&testdata("inline-big-fake-compressed-dwarf")).unwrap(); + let reader = obj.parse().unwrap(); + let section = reader.load_section(b".debug_info").unwrap().unwrap(); + assert!(matches!(section.data, SectionData::Swapped(_))); + assert_eq!(section.virt_addr(), 0); + assert_eq!(section.len(), 16 * 4 * 1024 * 1024); + assert!(section.iter().all(|x| *x == 0x00)); + } + } + + #[test] + fn memory_map() { + let obj = File::load(&testdata("inline")).unwrap(); + let reader = obj.parse().unwrap(); + let mem = reader.memory_map().unwrap(); + + for addr in [0, 0x640, 0x650, 0x944 - 1] { + let load_seg_1 = mem.section_for_addr(addr).unwrap(); + assert_eq!(load_seg_1.virt_addr(), 0); + 
assert_eq!(load_seg_1.virt_size(), 0x944); + assert_eq!(load_seg_1.len(), 0x944); + assert_eq!(&load_seg_1[0x640..0x644], b"\xFD\x7B\xBF\xA9"); + assert_eq!(mem.slice_for_addr(0x640, 4).unwrap(), b"\xFD\x7B\xBF\xA9"); + } + + assert!(mem.section_for_addr(0x944).is_none()); + assert!(mem.section_for_addr(0x1fdc8 - 1).is_none()); + + for addr in [0x1fdc8, 0x1fdc8 + 0x270, 0x1fdc8 + 0x278 - 1] { + let load_seg_2 = mem.section_for_addr(addr).unwrap(); + assert_eq!(load_seg_2.virt_addr(), 0x1fdc8); + assert_eq!(load_seg_2.virt_size(), 0x278); + assert_eq!(load_seg_2.len(), 0x270); + } + + // check truncation + assert_eq!(mem.slice_for_addr(0x1fdc8 + 0x26c, 0x8).unwrap().len(), 4); + assert_eq!(mem.slice_for_addr(0x1fdc8 + 0x270, 0x8).unwrap().len(), 0); + assert!(mem.slice_for_addr(0x1fdc8 + 0x278, 0x8).is_none()); + } + + #[test] + fn alt_link() { + let obj = File::load(&testdata("inline-split-dwarf")).unwrap(); + let reader = obj.parse().unwrap(); + let alt_link = reader.gnu_debug_alt_link().unwrap().unwrap(); + + #[rustfmt::skip] + assert_eq!( + alt_link.build_id, + GnuBuildId([ + 0x83, 0xFF, 0xD1, 0xE5, 0x5E, 0xB9, 0x9F, 0x9A, 0x41, 0xA0, + 0x77, 0xAD, 0xBC, 0x95, 0x09, 0x96, 0xBF, 0xB7, 0x93, 0x7F, + ]), + ); + + assert_eq!(alt_link.path, b"meow"); + } +} diff --git a/rust-crates/symblib/src/retpads.rs b/rust-crates/symblib/src/retpads.rs new file mode 100644 index 00000000..a336ad4a --- /dev/null +++ b/rust-crates/symblib/src/retpads.rs @@ -0,0 +1,367 @@ +// Copyright The OpenTelemetry Authors +// SPDX-License-Identifier: Apache-2.0 + +//! Logic for generating return pads from a range symbfile and the executable. + +use crate::{debug, disas, objfile, symbfile, AnyError, VirtAddr}; +use fallible_iterator::FallibleIterator; +use intervaltree::{Element, IntervalTree}; +use smallvec::{smallvec, SmallVec}; +use std::{io, mem}; + +/// Result type shorthand. +pub type Result = std::result::Result; + +/// Errors that can occur during return pad generation. 
+#[non_exhaustive] +#[allow(missing_docs)] +#[derive(Debug, thiserror::Error)] +pub enum Error { + #[error("Unsupported object file architecture")] + UnsupportedArch, + + #[error("Unable to locate code section for disassembly")] + TextSectionNotFound, + + #[error("Symbfile error: {}", .0)] + Symbfile(#[from] symbfile::Error), + + #[error("Objfile error: {}", .0)] + Objfile(#[from] objfile::Error), + + #[error("Disassembler error: {}", .0)] + Disas(#[from] disas::Error), + + #[error(transparent)] + Other(AnyError), +} + +/// Special-casing for the Go start-of-stack function. +/// +/// Go [manually pushes] a return address into their `runtime.goexit` function +/// onto the stack. This cannot be detected by our regular logic that merely +/// scans for call instructions and needs special handling. +/// +/// On both ARM64 and AMD64 this function consists of only two instructions +/// and the manually calculated return address points to the second instruction. +/// +/// [manually pushes]: https://github.com/golang/go/blob/a40404da7/src/runtime/proc.go#L4556 +fn go_stack_start_special_case( + mem: &objfile::MemoryMap<'_>, + decoder: &dyn disas::InstrDecoder, + sub_tree: &Vec, + mut visitor: impl FnMut(symbfile::ReturnPad) -> Result, +) -> Result { + let outer_func = &sub_tree[0]; + + if outer_func.func != "runtime.goexit" { + return Ok(()); + } + + let Some(code) = mem.slice_for_addr(outer_func.elf_va, u64::from(outer_func.length)) else { + debug!("Failed to read runtime.goexit memory"); + return Ok(()); + }; + + let first_insn = match decoder.decode(outer_func.elf_va, code) { + Ok(insn) => insn, + Err(e) => { + debug!("Failed to decode first instruction in runtime.goexit: {e:?}"); + return Ok(()); + } + }; + + visitor(symbfile::ReturnPad { + elf_va: outer_func.elf_va + u64::from(first_insn.length) - 1, + entries: smallvec![symbfile::ReturnPadEntry { + func: outer_func.func.clone(), + file: outer_func.file.clone(), + line: None, + }], + }) +} + +fn process_tree( + mem: 
&objfile::MemoryMap<'_>, + decoder: &dyn disas::InstrDecoder, + sub_tree: Vec, + mut visitor: impl FnMut(symbfile::ReturnPad) -> Result, +) -> Result<()> { + go_stack_start_special_case(mem, decoder, &sub_tree, &mut visitor)?; + + // Collect return pads by disassembling all relevant code. + let mut ret_pads = Vec::new(); + 'outer: for range in &sub_tree { + if range.depth != 0 { + // The top level (depth = 0) ranges must cover the ranges of all + // children. It is thus unnecessary to inspect the children here. + // In fact, doing so would even be incorrect and result in duplicate + // records to be inserted. + continue; + } + + let Some(code) = mem.slice_for_addr(range.elf_va, range.length as u64) else { + debug!( + "Unable to map {:x?} to code section, skipping.", + range.va_range() + ); + continue; + }; + + use disas::Error as DE; + let mut instr_iter = disas::decode_all(decoder, range.elf_va, code); + while let Some(instr) = match instr_iter.next() { + Ok(x) => x, + Err(DE::TruncatedInstruction(addr) | DE::InvalidInstruction(addr)) => { + debug!("Unable to decode instruction @ {:#08X}", addr); + continue 'outer; + } + Err(other) => return Err(Error::Disas(other)), + } { + if instr.is_call { + let call_va = instr.addr; + let ret_pad_va = instr.addr + instr.length as VirtAddr; + ret_pads.push((call_va, ret_pad_va)); + } + } + } + + // If no return pads were found, we are done here. + if ret_pads.is_empty() { + return Ok(()); + } + + // Construct interval tree to allow for quick lookups of all inline + // levels that belong to our return pads. + let tree = IntervalTree::from_iter(sub_tree.into_iter().map(|rng| Element { + range: rng.va_range(), + value: rng, + })); + + // Look up and emit inline trace for each return pad. + 'outer: for (call_va, ret_pad_va) in ret_pads { + // Use the address of the call instruction to create the trace. + let mut matches: Vec<_> = tree.query_point(call_va).collect(); + + // Need to process matches in ascending depth order. 
+ matches.sort_unstable_by_key(|x| x.value.depth); + + let mut entries = SmallVec::new(); + let mut iter = matches.iter().peekable(); + while let Some(Element { value: cur, .. }) = iter.next() { + let (file, line) = if let Some(Element { value: next, .. }) = iter.peek() { + if cur.depth + 1 != next.depth { + debug!( + "Detected hole in inline chain for call @ {:#08X}, skipping", + call_va + ); + continue 'outer; + } + + // For the first n-1 non-leaf entries, use the call_X fields. + (&next.call_file, next.call_line) + } else { + // For the leaf record, resolve the line using the line table. + (&cur.file, cur.line_number_for_va(call_va)) + }; + + entries.push(symbfile::ReturnPadEntry { + func: cur.func.clone(), + file: file.clone(), + line, + }); + } + + // If we didn't find any matches to construct a trace from, + // then there is no point in writing a record. + if entries.is_empty() { + continue; + } + + // Return pads are stored with a negative offset of 1 to be consistent + // with the non-leaf addresses sent by the host agent. Check `proto/symbfile/symbfile.proto` + // documentation for more information. + visitor(symbfile::ReturnPad { + elf_va: ret_pad_va - 1, + entries, + })?; + } + + Ok(()) +} + +/// Extract return pads by combining the given ranges and the corresponding +/// executable, writing them into the given output stream in the form of a +/// return pad symbfile. +pub fn create_retpad_symbfile( + exec_path: &std::path::Path, + range_reader: impl io::Read, + retpad_writer: impl io::Write, +) -> Result { + let mut writer = symbfile::Writer::new(retpad_writer)?; + + let obj = objfile::File::load(exec_path)?; + let obj = obj.parse()?; + + let ranges = symbfile::Reader::new(range_reader)? 
+ .filter_map(|msg| match msg { + symbfile::Record::Range(range) => Ok(Some(range)), + _other => Ok(None), + }) + .map_err(Error::Symbfile); + + extract_retpads(&obj, ranges, |rp| writer.write(rp).map_err(Error::Symbfile))?; + + writer.finalize()?; + Ok(()) +} + +/// Extract return pads by combining the given range file IO reader and +/// the corresponding executable. +/// +/// The `visitor` callback is invoked for every return pad in the executable. +/// Returning an error will abort further execution and return early. +pub fn extract_retpads( + executable: &objfile::Reader<'_>, + mut ranges: impl FallibleIterator, + mut visitor: impl FnMut(symbfile::ReturnPad) -> Result, +) -> Result { + let mem = executable.memory_map()?; + + // Create type-erased instruction decoder. + let decoder: Box = match executable.arch() { + Some(objfile::Arch::X86_64) => Box::::default(), + Some(objfile::Arch::Aarch64) => Box::::default(), + None => return Err(Error::UnsupportedArch), + }; + + let mut tree_buf = Vec::new(); + while let Some(range) = ranges.next()? { + // The symbfile range files contain the flattened inline tree in + // pre-order depth-first search order. This means that to collect + // all children of a particular sub-tree we simply need to check + // whether the depth field returns to 0 at some point. + if range.depth == 0 && !tree_buf.is_empty() { + let sub_tree = mem::replace(&mut tree_buf, vec![range]); + process_tree(&mem, &*decoder, sub_tree, &mut visitor)?; + } else { + tree_buf.push(range); + } + } + + // Process final batch. 
+ if !tree_buf.is_empty() { + process_tree(&mem, &*decoder, tree_buf, &mut visitor)?; + } + + Ok(()) +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::tests::testdata; + use std::fs::File; + use std::io::{Seek as _, SeekFrom}; + + #[test] + fn translation() { + let exec_path = testdata("inline-no-tco"); + let range_symbfile = File::open(testdata("inline-no-tco.ranges.symbfile")).unwrap(); + let mut retpad_symbfile = tempfile::tempfile().unwrap(); + create_retpad_symbfile(&exec_path, range_symbfile, &mut retpad_symbfile).unwrap(); + retpad_symbfile.seek(SeekFrom::Start(0)).unwrap(); + + let mut reader = symbfile::Reader::new(retpad_symbfile).unwrap(); + + // The message order in return pad symbfiles is undefined: read all and sort. + let mut records = Vec::::new(); + while let Some(msg) = reader.read().unwrap() { + records.push(match msg { + symbfile::Record::ReturnPad(pad) => pad, + _ => panic!("unexpected record type"), + }); + } + records.sort_unstable_by_key(|x| x.elf_va); + let mut record_iter = records.iter(); + + // Reference data created by scrolling through disassembly in IDA, + // looking for any calls in functions with DWARF info. + + // .text:000648 6A 00 00 94 BL a + // .text:00064C 00 00 00 90+ ADRL X0, aHello ; "hello!" 
+ + let a_call = record_iter.next().unwrap(); + let inline_file = + Some("/media/share/Development/prodfiler/libpf-rs/testdata/inline.c".to_owned()); + assert_eq!(a_call.elf_va, 0x64c - 1); + assert_eq!(a_call.entries.len(), 1); + assert_eq!(a_call.entries[0].func, "main"); + assert_eq!(a_call.entries[0].file, inline_file.clone()); + assert_eq!(a_call.entries[0].line, Some(39)); + + // .text:000654 F7 FF FF 97 BL .puts + // .text:000658 00 00 80 52 MOV W0, #0 + let puts_call_in_main = record_iter.next().unwrap(); + assert_eq!(puts_call_in_main.elf_va, 0x658 - 1); + assert_eq!(puts_call_in_main.entries.len(), 5); + + assert_eq!(puts_call_in_main.entries[0].func, "main"); + assert_eq!(puts_call_in_main.entries[0].file, inline_file.clone()); + assert_eq!(puts_call_in_main.entries[0].line, Some(40)); + + assert_eq!(puts_call_in_main.entries[1].func, "a_inline"); + assert_eq!(puts_call_in_main.entries[1].file, inline_file.clone()); + assert_eq!(puts_call_in_main.entries[1].line, Some(35)); + + assert_eq!(puts_call_in_main.entries[2].func, "b_inline"); + assert_eq!(puts_call_in_main.entries[2].file, inline_file.clone()); + assert_eq!(puts_call_in_main.entries[2].line, Some(31)); + + assert_eq!(puts_call_in_main.entries[3].func, "c_inline"); + assert_eq!(puts_call_in_main.entries[3].file, inline_file.clone()); + assert_eq!(puts_call_in_main.entries[3].line, Some(27)); + + assert_eq!(puts_call_in_main.entries[4].func, "d_inline"); + assert_eq!(puts_call_in_main.entries[4].file, inline_file.clone()); + assert_eq!(puts_call_in_main.entries[4].line, Some(23)); + + // .text:0007B0 A0 FF FF 97 BL .puts + // .text:0007B4 FD 7B C1 A8 LDP X29, X30, [SP+var_s0],#0x10 + let puts_call_in_d = record_iter.next().unwrap(); + assert_eq!(puts_call_in_d.elf_va, 0x7b4 - 1); + assert_eq!(puts_call_in_d.entries.len(), 1); + assert_eq!(puts_call_in_d.entries[0].func, "d"); + assert_eq!(puts_call_in_d.entries[0].file, inline_file.clone()); + assert_eq!(puts_call_in_d.entries[0].line, Some(7)); 
+ + // .text:0007C8 F6 FF FF 97 BL d + // .text:0007CC FD 7B C1 A8 LDP X29, X30, [SP+var_s0],#0x10 + let d_call_in_c = record_iter.next().unwrap(); + assert_eq!(d_call_in_c.elf_va, 0x7cc - 1); + assert_eq!(d_call_in_c.entries.len(), 1); + assert_eq!(d_call_in_c.entries[0].func, "c"); + assert_eq!(d_call_in_c.entries[0].file, inline_file.clone()); + assert_eq!(d_call_in_c.entries[0].line, Some(11)); + + // .text:00007DC F9 FF FF 97 BL c + // .text:00007E0 FD 7B C1 A8 LDP X29, X30, [SP+var_s0],#0x10 + let c_call_in_b = record_iter.next().unwrap(); + assert_eq!(c_call_in_b.elf_va, 0x7e0 - 1); + assert_eq!(c_call_in_b.entries.len(), 1); + assert_eq!(c_call_in_b.entries[0].func, "b"); + assert_eq!(c_call_in_b.entries[0].file, inline_file.clone()); + assert_eq!(c_call_in_b.entries[0].line, Some(15)); + + // .text:0007F8 F7 FF FF 97 BL b + // .text:0007FC FD 7B C1 A8 LDP X29, X30, [SP+var_s0],#0x10 + let b_call_in_a = record_iter.next().unwrap(); + assert_eq!(b_call_in_a.elf_va, 0x7fc - 1); + assert_eq!(b_call_in_a.entries.len(), 1); + assert_eq!(b_call_in_a.entries[0].func, "a"); + assert_eq!(b_call_in_a.entries[0].file, inline_file.clone()); + assert_eq!(b_call_in_a.entries[0].line, Some(19)); + + assert!(record_iter.next().is_none()); + } +} diff --git a/rust-crates/symblib/src/symbconv/dwarf/mod.rs b/rust-crates/symblib/src/symbconv/dwarf/mod.rs new file mode 100644 index 00000000..260980f1 --- /dev/null +++ b/rust-crates/symblib/src/symbconv/dwarf/mod.rs @@ -0,0 +1,503 @@ +// Copyright The OpenTelemetry Authors +// SPDX-License-Identifier: Apache-2.0 + +//! Translates DWARF data into a range symbfile. +//! +//! Input format: DWARF +//! +//! - Inline instance information is stored in a tree structure +//! - Function name, inline hierarchy and inline call file + line +//! - Every tree node can have 0..n PC ranges +//! - Source lines and files are stored in a separate per-CU structure (line-table) +//! +//! Output format: symbfile +//! +//! - Flat list of records +//! 
- Depth is defined via an integer depth field +//! - Combines line table with inline tree +//! - Each record has only one range +//! - Each range's line table can only refer to a single source file +//! - If an inline instance contains instructions generated from multiple +//! source files, it must be split every time the source file changes + +mod rangetree; + +use self::rangetree::*; +use crate::symbconv::RangeVisitor; +use crate::{debug, demangle, dwarf, range_overlap, symbfile, AnyError, VirtAddr}; +use fallible_iterator::FallibleIterator; +use intervaltree::{Element, IntervalTree}; +use smallvec::SmallVec; +use std::cell::RefCell; +use std::num::NonZeroU64; +use std::ops::Range; +use std::rc::Rc; + +/// Maximum depth of an inline tree. +/// +/// Various parts of this implementation use recursion. If we didn't restrict +/// the inline tree depth, we'd run at risk of running into stack overflows. +const MAX_INLINE_TREE_DEPTH: u64 = 256; + +/// Errors that can occur during symbol translation. +#[non_exhaustive] +#[allow(missing_docs)] +#[derive(Debug, thiserror::Error)] +pub enum Error { + #[error("Inline tree depth exceeds the maximum of {}", MAX_INLINE_TREE_DEPTH)] + InlineTreeTooDeep, + + #[error("DWARF error: {}", .0)] + Dwarf(#[from] dwarf::Error), + + #[error("symbfile error: {}", .0)] + Symbfile(#[from] symbfile::Error), + + #[error("visitor error: {}", .0)] + Visitor(#[source] AnyError), + + #[error(transparent)] + Other(AnyError), +} + +/// Result type shorthand. +type Result = std::result::Result; + +/// Maps a VA range to a source file and line. +#[derive(Debug, Clone)] +struct IntermediateLineTableEntry { + pub rng: Range, + pub file: String, + pub line: u64, +} + +/// Intermediate helper struct storing source lines for a VA range. +/// +/// Other than in the final symbfile range struct, the line table in this +/// intermediate format can still contain lines from different source files. 
+#[derive(Debug)] +struct IntermediateRange<'dwarf, 'units> { + pub info: Rc>, + pub line_table: RefCell>, +} + +impl<'dwarf, 'units> IntermediateRange<'dwarf, 'units> { + pub fn new(info: Rc>) -> Self { + Self { + info, + line_table: RefCell::new(Vec::new()), + } + } +} + +/// Node format for our intermediate range tree. +/// +/// Since DWARF nodes can have multiple ranges associated with them whereas +/// in our tree structure every node represents a single range, we duplicate +/// the node for each DWARF range. Because the multiple ranges of the DWARF +/// root node need to live somewhere as well, a synthetic root node spanning +/// the entire VA space from [`VirtAddr::MIN`] to [`VirtAddr::MAX`] is used. +#[derive(Debug)] +enum Node<'dwarf, 'units> { + /// Synthetic root without any actual data. + SynthRoot, + + /// Range with source-line information. + Range(IntermediateRange<'dwarf, 'units>), +} + +/// Constructs a subroutine tree with empty line tables. +fn collect_subroutine_tree<'dwarf, 'units>( + sub: dwarf::Subprogram<'dwarf, 'units>, +) -> Result>> { + let mut root = RangeTree { + range: VirtAddr::MIN..VirtAddr::MAX, + value: Node::SynthRoot, + children: Vec::with_capacity(8), + }; + + let mut sub_iter = sub.into_iter(); + while let Some(mut i) = sub_iter.next()? { + let Some(mut ranges) = i.take_ranges() else { + continue; + }; + if i.depth() > MAX_INLINE_TREE_DEPTH { + return Err(Error::InlineTreeTooDeep); + } + + // Insert one node for each range. + let i = Rc::new(i); + while let Some(range) = ranges.next()? { + if range.start <= 1 { + continue; + } + + let Some(container) = root.find_match_at_depth_mut(i.depth(), range.clone()) else { + continue; + }; + + container.children.push(RangeTree { + range, + value: Node::Range(IntermediateRange::new(i.clone())), + children: Vec::with_capacity(8), + }); + } + } + + root.sort(); + + Ok(root) +} + +/// Checks whether any child has line info. 
+fn any_child_has_lines(node: &RangeTree>) -> bool { + if let Node::Range(imr) = &node.value { + if !imr.line_table.borrow().is_empty() { + return true; + } + } + + node.children.iter().any(any_child_has_lines) +} + +fn process_subroutine( + unit_line_table: &IntervalTree, Option)>, + sub: dwarf::Subprogram<'_, '_>, + mut visitor: impl FnMut(symbfile::Range) -> Result, +) -> Result { + let tree = collect_subroutine_tree(sub)?; + + if tree.children.is_empty() { + return Ok(()); + } + + // Use the top-level view of the tree to assign line records. + for node in tree.collect_top_level_ranges() { + // Skip synthetic root nodes. + let RangeTreeRef { + range, + value: Node::Range(imr), + .. + } = node + else { + continue; + }; + + let mut im_linetab = imr.line_table.borrow_mut(); + for line_record in unit_line_table.query(range.clone()) { + let (ref file, Some(line)) = line_record.value else { + // Skip records without line/file info. + continue; + }; + + // Restrict range to the overlapping region with our node. + let Some(overlap) = range_overlap(&line_record.range, &range) else { + continue; + }; + + im_linetab.push(IntermediateLineTableEntry { + rng: overlap, + file: file.to_string(), + line: line.get(), + }); + } + + im_linetab.sort_unstable_by_key(|x| x.rng.start); + im_linetab.dedup_by(|a, b| { + let same_range = a.rng.start == b.rng.start; + let same_line = a.line == b.line && a.file == b.file; + same_range || same_line + }); + } + + // With the line numbers assigned, now emit the ranges in symbfile format. + for node in tree.iter_dfs() { + let imr = match &node.value { + Node::Range(imr) => imr, + Node::SynthRoot => continue, + }; + + // If the function doesn't have a name, we can't really do anything + // useful with it in symbolization. Skip. + let Some(name) = imr.info.name()? 
else { + continue; + }; + + let mut record = symbfile::Range { + elf_va: node.range.start, + length: (node.range.end - node.range.start) as _, + func: demangle::demangle(&name).into_owned(), + file: None, + call_file: imr.info.call_file()?.map(|x| x.to_string()), + call_line: imr.info.call_line().map(|x| x.get() as u32), + depth: imr.info.depth() as _, + line_table: SmallVec::new(), + }; + + let line_table = imr.line_table.borrow(); + + for lte in line_table.iter() { + if let Some(prev_file) = &record.file { + // We should probably also split for holes in the line table + // that aren't covered by inline instances, but computing this + // is unfortunately rather expensive. + + if prev_file != &lte.file { + // File changed: split record. + let mut clone = record.clone(); + clone.length = (lte.rng.start - record.elf_va) as u32; + visitor(clone)?; + + record.elf_va = lte.rng.start; + record.length = (node.range.end - lte.rng.start) as u32; + record.line_table.clear(); + record.file = Some(lte.file.clone()); + } + } else { + record.file = Some(lte.file.clone()); + } + + record.line_table.push(symbfile::LineTableEntry { + offset: (lte.rng.start - record.elf_va) as _, + line_number: lte.line as _, + }); + } + + if !any_child_has_lines(node) { + continue; + } + + visitor(record)?; + } + + Ok(()) +} + +fn process_unit( + unit: dwarf::Unit<'_, '_>, + mut visitor: impl FnMut(symbfile::Range) -> Result, +) -> Result { + // If the line table is empty, we can't do anything useful with this unit. Skip. + let Some(line_iter) = unit.line_iter() else { + return Ok(()); + }; + + // Construct an interval tree for fast lookups. We unfortunately have + // to first collect it into a vector, then move it into the interval + // tree because `line_iter` is a fallible iterator which cannot be used + // to construct an interval tree directly. 
+ let line_table = IntervalTree::from_iter( + line_iter + .filter(|x| Ok(!x.rng.is_empty())) + .map(|x| { + Ok(Element { + range: x.rng, + value: (x.file, x.line), + }) + }) + .collect::<Vec<_>>()?, + ); + + // Process all subroutines in the unit. + let mut sr_iter = unit.subprograms(); + while let Some(routine) = sr_iter.next()? { + process_subroutine(&line_table, routine, &mut visitor)?; + } + + Ok(()) +} + +/// DWARF translation statistics. +#[derive(Debug, Default, PartialEq, Eq)] +pub struct Stats { + /// Number of units that were successfully processed. + pub units_ok: u64, + + /// Number of units that had to be skipped due to parsing issues. + /// + /// This includes units that were partially processed but encountered + /// errors halfway through. + pub units_broken: u64, +} + +/// Extract address ranges and their source-file mapping from the given DWARF +/// sections. +fn extract_ranges( + dw: &dwarf::Sections<'_>, + mut visitor: impl FnMut(symbfile::Range) -> Result, +) -> Result<Stats> { + let units = dw.units()?; + let mut unit_iter = units.iter(); + let mut stats = Stats::default(); + + loop { + let unit = match unit_iter.next() { + Ok(Some(unit)) => unit, + Ok(None) => break, + Err(e) => { + debug!("Skipping unit with broken header: {:?}", e); + stats.units_broken += 1; + continue; + } + }; + + debug!("Processing {:?}", &unit); + match process_unit(unit, &mut visitor) { + Ok(()) => stats.units_ok += 1, + Err(e) => { + debug!("Aborted unit processing due to error: {:?}", e); + stats.units_broken += 1; + } + } + } + + Ok(stats) +} + +/// Extract symbol ranges from DWARF debug info. +pub struct Extractor<'dw, 'obj>(&'dw dwarf::Sections<'obj>); + +impl<'dw, 'obj> Extractor<'dw, 'obj> { + /// Create a new extractor. 
+ pub fn new(dw: &'dw dwarf::Sections<'obj>) -> Self { + Self(dw) + } +} + +impl<'dw, 'obj> super::RangeExtractor for Extractor<'dw, 'obj> { + fn extract(&self, visitor: RangeVisitor<'_>) -> super::Result> { + let visitor_adapter = |range| visitor(range).map_err(Error::Visitor); + extract_ranges(self.0, visitor_adapter) + .map(|x| Some(super::Stats::Dwarf(x))) + .map_err(super::Error::Dwarf) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::{ + dwarf, objfile, + symbconv::{RangeExtractor as _, Stats as SymbconvStats}, + symbfile::{self, LineTableEntry}, + tests::testdata, + }; + use std::io::{Seek, SeekFrom}; + + #[test] + fn inline() { + let obj = objfile::File::load(&testdata("inline")).unwrap(); + let obj = obj.parse().unwrap(); + let dwarf = dwarf::Sections::load(&obj).unwrap(); + let mut out_file = tempfile::tempfile().unwrap(); + let extr = Extractor::new(&dwarf); + let stats = extr.extract_to_symbfile(&mut out_file).unwrap(); + + assert!(matches!( + &stats, + Some(SymbconvStats::Dwarf(Stats { + units_ok: 1, + units_broken: 0, + })), + )); + + out_file.seek(SeekFrom::Start(0)).unwrap(); + let mut reader = symbfile::Reader::new(out_file).unwrap(); + + let mut rng: symbfile::Range = reader.read().unwrap().unwrap().unwrap_range(); + let src_file = "/media/share/Development/prodfiler/libpf-rs/testdata/inline.c"; + assert_eq!(rng.elf_va, 0x640); + assert_eq!(rng.length, 0x664 - 0x640); + assert_eq!(rng.func, "main"); + assert_eq!(rng.file.unwrap(), src_file,); + assert_eq!(rng.call_line, None); + assert_eq!(rng.call_file, None); + assert_eq!(rng.depth, 0); + + // `symbtool dwarf -e inline dump` excerpt: + // + // [0x000640..0x000640) /media/share/Development/prodfiler/libpf-rs/testdata/inline.c:38 + // [0x000640..0x000640) /media/share/Development/prodfiler/libpf-rs/testdata/inline.c:39 + // [0x000640..0x000648) /media/share/Development/prodfiler/libpf-rs/testdata/inline.c:38 + // [0x000648..0x00064C) 
/media/share/Development/prodfiler/libpf-rs/testdata/inline.c:39 + // [...] (covered by inline instances) + // [0x000658..0x000664) /media/share/Development/prodfiler/libpf-rs/testdata/inline.c:41 + + assert_eq!( + &rng.line_table[..], + &[ + LineTableEntry { + offset: 0x640 - 0x640, + line_number: 38, + }, + LineTableEntry { + offset: 0x648 - 0x640, + line_number: 39, + }, + LineTableEntry { + offset: 0x658 - 0x640, + line_number: 41, + }, + ] + ); + + rng = reader.read().unwrap().unwrap().unwrap_range(); + assert_eq!(rng.elf_va, 0x64c); + assert_eq!(rng.length, 0x658 - 0x64c); + assert_eq!(rng.depth, 1); + assert_eq!(rng.func, "a_inline"); + assert_eq!(rng.call_file.unwrap(), src_file); + assert_eq!(rng.call_line.unwrap(), 40); + assert!(rng.line_table.is_empty()); + + rng = reader.read().unwrap().unwrap().unwrap_range(); + assert_eq!(rng.elf_va, 0x64c); + assert_eq!(rng.length, 0x658 - 0x64c); + assert_eq!(rng.depth, 2); + assert_eq!(rng.func, "b_inline"); + assert!(rng.line_table.is_empty()); + + rng = reader.read().unwrap().unwrap().unwrap_range(); + assert_eq!(rng.elf_va, 0x64c); + assert_eq!(rng.length, 0x658 - 0x64c); + assert_eq!(rng.depth, 3); + assert_eq!(rng.func, "c_inline"); + assert!(rng.line_table.is_empty()); + + rng = reader.read().unwrap().unwrap().unwrap_range(); + assert_eq!(rng.elf_va, 0x64c); + assert_eq!(rng.length, 0x658 - 0x64c); + assert_eq!(rng.depth, 4); + assert_eq!(rng.func, "d_inline"); + assert_eq!( + &rng.line_table[..], + &[LineTableEntry { + offset: 0, + line_number: 23, + }] + ); + + rng = reader.read().unwrap().unwrap().unwrap_range(); + assert_eq!(rng.elf_va, 0x7c0); + assert_eq!(rng.length, 0x7c4 - 0x7c0); + assert_eq!(rng.depth, 0); + assert_eq!(rng.func, "a"); + assert_eq!( + &rng.line_table[..], + &[LineTableEntry { + offset: 0, + line_number: 19, + }] + ); + + // All same schema as `a` above: no need to repeat everything. 
+ rng = reader.read().unwrap().unwrap().unwrap_range(); + assert_eq!(rng.func, "b"); + rng = reader.read().unwrap().unwrap().unwrap_range(); + assert_eq!(rng.func, "c"); + rng = reader.read().unwrap().unwrap().unwrap_range(); + assert_eq!(rng.func, "d"); + + assert!(reader.read().unwrap().is_none()); + } +} diff --git a/rust-crates/symblib/src/symbconv/dwarf/rangetree.rs b/rust-crates/symblib/src/symbconv/dwarf/rangetree.rs new file mode 100644 index 00000000..e80d3152 --- /dev/null +++ b/rust-crates/symblib/src/symbconv/dwarf/rangetree.rs @@ -0,0 +1,267 @@ +// Copyright The OpenTelemetry Authors +// SPDX-License-Identifier: Apache-2.0 + +use crate::VirtAddr; +use std::collections::VecDeque; +use std::{iter, ops}; + +/// References a sub-range of a tree node. +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct RangeTreeRef<'tree, T> { + /// Sub-range being referenced. Not empty. + pub range: ops::Range, + + /// Depth of the referenced node within the tree. + pub depth: usize, + + /// Reference to the value of the node. + pub value: &'tree T, +} + +/// Specialized range tree structure. +/// +/// All children in each node must: +/// - not overlap with any other child +/// - not be empty +/// - be fully covered by the parent range +/// - be sorted ascending by range (start, end) +#[derive(Debug, Clone)] +pub struct RangeTree { + /// Range covered by this node. Must not be empty. + pub range: ops::Range, + + /// Value associated with this node. + pub value: T, + + /// List of child nodes. + /// + /// All children must be sub-ranges of [`Self::range`] and cannot + /// overlap each other. The list must be sorted ascending by range + /// start. Ranges that start at the same offset must be sorted by + /// ascending range end. + pub children: Vec>, +} + +impl RangeTree { + /// Recursively sort children to arrive at the required ordering guarantees. 
+ pub fn sort(&mut self) { + self.children + .sort_unstable_by_key(|x| (x.range.start, x.range.end)); + + for child in &mut self.children { + child.sort(); + } + } + + /// Collects a flat list of the most specific nodes covering each range. + /// + /// The most specific node is the deepest child node covering the range. If + /// you imagine the tree to be represented as a flame graph projected into + /// 3D space, this would essentially represent the view from the top. + /// + /// For example, the tree + /// + /// ```text + /// Depth + /// 2 ┃ [ malloc ][ strcpy ] + /// 1 ┃ [ strdup ] [ puts ] + /// 0 ┃ [ main ] + /// ━━╋━━━━━━━━━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━━━━━━┷━━━━━ VA + /// 0x000 0x100 0x200 + /// ``` + /// + /// would result in this result to be returned: + /// + /// ```text + /// strdup strdup main + /// ↓ ↓ ↓ + /// [ ][ malloc ][ strcpy ][ ][main][ puts ][ ] + /// ━━╋━━━━━━━━━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━━━━━━┷━━━━━ VA + /// 0x000 0x100 0x200 + /// ``` + pub fn collect_top_level_ranges(&self) -> Vec> { + let mut out = Vec::with_capacity(128); + self.collect_top_level_ranges_rec(&mut out, 0); + out + } + + fn collect_top_level_ranges_rec<'tree>( + &'tree self, + out: &mut Vec>, + depth: usize, + ) { + out.push(RangeTreeRef { + range: self.range.clone(), + depth, + value: &self.value, + }); + + for child in &self.children { + let prev = out.last_mut().unwrap(); + debug_assert!(prev.range.end >= prev.range.start); + debug_assert!(prev.range.end >= child.range.end); + debug_assert!(child.range.start >= prev.range.start); + + // Truncate previous node: the child takes precedence. + prev.range.end = child.range.start; + + // If the truncation caused the previous node to be empty, get rid of it. + if prev.range.is_empty() { + out.pop(); + } + + // Recurse into children. + child.collect_top_level_ranges_rec(out, depth + 1); + + // If the child doesn't fully cover our range to end, insert ourself again. 
+ let prev = out.last().unwrap(); + if self.range.end > prev.range.end { + out.push(RangeTreeRef { + range: prev.range.end..self.range.end, + depth, + value: &self.value, + }); + } else { + debug_assert_eq!(self.range.end, prev.range.end); + } + } + } + + /// Finds the matching tree node at the given depth. + pub fn find_match_at_depth_mut( + &mut self, + at_depth: u64, + rng: ops::Range, + ) -> Option<&mut RangeTree> { + fn is_sub_range(outer: &ops::Range, sub: &ops::Range) -> bool { + sub.start >= outer.start && sub.end <= outer.end + } + + if at_depth == 0 { + return if is_sub_range(&self.range, &rng) { + Some(self) + } else { + None + }; + } + + let mut node = self; + 'outer: for depth in 1..=at_depth { + for child in &mut node.children { + if is_sub_range(&child.range, &rng) { + node = child; + + if depth == at_depth { + return Some(node); + } + + continue 'outer; + } + } + + break; + } + + None + } + + /// Iterate over the tree's items, in depth-first order. + pub fn iter_dfs(&self) -> impl Iterator { + let mut queue = VecDeque::from([self]); + + iter::from_fn(move || { + let node = queue.pop_back()?; + queue.extend(node.children.iter().rev()); + Some(node) + }) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + fn make_test_tree() -> RangeTree { + RangeTree { + range: 0x100..0x1000, + value: 1, + children: vec![ + RangeTree { + range: 0x200..0x300, + value: 2, + children: vec![], + }, + RangeTree { + range: 0x400..0x700, + value: 3, + children: vec![], + }, + ], + } + } + + #[test] + fn fn_tree_node() { + let tree = make_test_tree(); + + let expected = [ + RangeTreeRef { + range: 0x100..0x200, + depth: 0, + value: &1, + }, + RangeTreeRef { + range: 0x200..0x300, + depth: 1, + value: &2, + }, + RangeTreeRef { + range: 0x300..0x400, + depth: 0, + value: &1, + }, + RangeTreeRef { + range: 0x400..0x700, + depth: 1, + value: &3, + }, + RangeTreeRef { + range: 0x700..0x1000, + depth: 0, + value: &1, + }, + ]; + + let flat = tree.collect_top_level_ranges(); + 
assert_eq!(flat.len(), expected.len()); + for (actual, expected) in iter::zip(flat, expected) { + assert_eq!(actual, expected); + } + } + + #[test] + fn find_match_at_depth() { + let mut tree = make_test_tree(); + + let node = tree.find_match_at_depth_mut(0, 0x30..0x50); + assert!(node.is_none()); + + let node = tree.find_match_at_depth_mut(0, 0x110..0x150).unwrap(); + assert_eq!(node.value, 1); + + let node = tree.find_match_at_depth_mut(1, 0x210..0x250).unwrap(); + assert_eq!(node.value, 2); + + let node = tree.find_match_at_depth_mut(0, 0x200..0x300).unwrap(); + assert_eq!(node.value, 1); + + let node = tree.find_match_at_depth_mut(1, 0x200..0x300).unwrap(); + assert_eq!(node.value, 2); + + let node = tree.find_match_at_depth_mut(3, 0x200..0x300); + assert!(node.is_none()); + + let node = tree.find_match_at_depth_mut(0, 0x310..0x330).unwrap(); + assert_eq!(node.value, 1); + } +} diff --git a/rust-crates/symblib/src/symbconv/go.rs b/rust-crates/symblib/src/symbconv/go.rs new file mode 100644 index 00000000..25c379e0 --- /dev/null +++ b/rust-crates/symblib/src/symbconv/go.rs @@ -0,0 +1,113 @@ +// Copyright The OpenTelemetry Authors +// SPDX-License-Identifier: Apache-2.0 + +//! Translates Go symbols into symbfile ranges. +//! +//! This is currently still very basic and doesn't support inline functions +//! or constructing line tables. + +use crate::{gosym, objfile, symbfile, AnyError}; +use fallible_iterator::FallibleIterator as _; + +/// Result type shorthand. +pub type Result = std::result::Result; + +/// Errors that can occurr during Go symbol extraction. +#[non_exhaustive] +#[allow(missing_docs)] +#[derive(Debug, thiserror::Error)] +pub enum Error { + #[error("go metadata parsing error: {0}")] + Gosym(#[from] gosym::Error), + + #[error("visitor returned an error: {0}")] + Visitor(#[source] AnyError), +} + +/// Go symbol extraction statistics. +#[derive(Debug, Default)] +pub struct Stats { + /// Functions that we succeeded extracting symbols from. 
+ pub funcs_ok: u64, + + /// Functions that we had to skip due to parsing errors. + pub funcs_skipped: u64, + + /// Whether the executable was detected to be a Go executable. + pub is_go_binary: bool, +} + +/// `.gopclntab` symbol extractor. +pub struct Extractor<'obj>(&'obj objfile::Reader<'obj>); + +impl<'obj> Extractor<'obj> { + /// Create a new extractor. + pub fn new(obj: &'obj objfile::Reader<'obj>) -> Self { + Extractor(obj) + } +} + +impl<'obj> super::RangeExtractor for Extractor<'obj> { + fn extract(&self, visitor: super::RangeVisitor<'_>) -> super::Result> { + extract_ranges(self.0, visitor) + .map(|x| Some(super::Stats::Go(x))) + .map_err(super::Error::Go) + } +} + +fn extract_ranges(obj: &objfile::Reader<'_>, visitor: super::RangeVisitor<'_>) -> Result { + let mut stats = Stats::default(); + + let go = match gosym::GoRuntimeInfo::open(obj) { + Ok(x) => x, + Err(gosym::Error::GopclntabNotFound) => return Ok(stats), + Err(other) => return Err(other.into()), + }; + + stats.is_go_binary = true; + + let mut func_iter = go.funcs()?; + while let Some(func) = func_iter.next()? { + // Infer end of function from line tables. + let Some(end) = func.line_mapping()?.map(|(rng, _)| Ok(rng.end)).max()? else { + eprintln!( + "WARN: unable to determine end of function ({})", + func.name()? + ); + stats.funcs_skipped += 1; + continue; + }; + + let length = end.saturating_sub(func.start_addr()); + if length == 0 { + eprintln!("WARN: zero function length ({})", func.name()?); + stats.funcs_skipped += 1; + continue; + } + + // Pick first file and hope for the best. So far this has worked for + // all samples that I've looked at. Even in the presence of inline + // functions there will always be a prologue that has the file of the + // outer function assigned to it. + let file = func + .file_mapping()? 
+ .find_map(|(_, name)| Ok(name.map(|x| x.to_owned())))?; + + let range = symbfile::Range { + elf_va: func.start_addr(), + length: length as _, + func: func.name()?.to_owned(), + file, + call_file: None, + call_line: None, + depth: 0, + line_table: Default::default(), + }; + + visitor(range).map_err(Error::Visitor)?; + + stats.funcs_ok += 1; + } + + Ok(stats) +} diff --git a/rust-crates/symblib/src/symbconv/mod.rs b/rust-crates/symblib/src/symbconv/mod.rs new file mode 100644 index 00000000..4708a842 --- /dev/null +++ b/rust-crates/symblib/src/symbconv/mod.rs @@ -0,0 +1,87 @@ +// Copyright The OpenTelemetry Authors +// SPDX-License-Identifier: Apache-2.0 + +//! Extract symbol info and convert it to [`symbfile`] format. + +use crate::{objfile, symbfile, AnyError}; +use std::io; + +/// Result type shorthand. +pub type Result = std::result::Result; + +/// Errors that can occurr during symbol extraction. +#[non_exhaustive] +#[allow(missing_docs)] +#[derive(Debug, thiserror::Error)] +pub enum Error { + #[error("symbfile: {0}")] + Symbfile(#[from] symbfile::Error), + + #[error("objfile: {0}")] + Objfile(#[from] objfile::Error), + + #[error("obj sym extraction: {0}")] + Obj(#[source] AnyError), + + #[error("DWARF: {0}")] + Dwarf(#[from] dwarf::Error), + + #[error("multi extractor: {0}")] + Multi(#[from] multi::Error), + + #[error("Go: {0}")] + Go(#[from] go::Error), +} + +/// Callback processing ranges. +pub type RangeVisitor<'a> = &'a mut dyn FnMut(symbfile::Range) -> Result<(), AnyError>; + +/// Extractor-specific statistics collected during symbol extraction. +#[derive(Debug)] +pub enum Stats { + /// Go symbol extractor statistics. + Go(go::Stats), + /// DWARF symbol extractor statistics. + Dwarf(dwarf::Stats), + /// Multi symbol extractor statistics. + Multi(multi::Stats), +} + +/// Common interface for all range extractor. 
+/// +/// A range extractor is a component that can extract mappings from address +/// ranges to symbols (function name, file name, line number, etc.). +pub trait RangeExtractor { + /// Extract address ranges and their source-file mappings. + /// + /// The `visitor` callback is invoked for every range extracted from the + /// executable. Returning an error will abort further execution and return + /// early. + /// + /// Implementations that support inline function extraction must make sure + /// that inline ranges (depth > 0) always immediately follow after the + /// top-level (depth = 0) range. + fn extract(&self, visitor: RangeVisitor<'_>) -> Result>; + + /// Extract address ranges and their source-file mappings and write them to + /// an IO writer in range symbfile format. + /// + /// The caller should pass a buffered writer for performance reasons. + fn extract_to_symbfile(&self, out: &mut dyn io::Write) -> Result> { + let mut out = symbfile::Writer::new(out)?; + let mut visitor = |range| { + out.write(range) + .map_err(|x| AnyError::from(Error::Symbfile(x))) + }; + let stats = self.extract(&mut visitor)?; + out.finalize()?; + Ok(stats) + } +} + +fn _assert_obj_safe(_: &dyn RangeExtractor) {} + +pub mod dwarf; +pub mod go; +pub mod multi; +pub mod obj; diff --git a/rust-crates/symblib/src/symbconv/multi.rs b/rust-crates/symblib/src/symbconv/multi.rs new file mode 100644 index 00000000..c88a459a --- /dev/null +++ b/rust-crates/symblib/src/symbconv/multi.rs @@ -0,0 +1,329 @@ +// Copyright The OpenTelemetry Authors +// SPDX-License-Identifier: Apache-2.0 + +//! Extract and combine symbols from multiple sources. + +use super::{RangeExtractor, RangeVisitor}; +use crate::covmap::{CovMap, SegmentedCovMap}; +use crate::{objfile, range_overlap, VirtAddr}; +use std::num::NonZeroU64; +use std::ops; + +/// Result type shorthand. +pub type Result = std::result::Result; + +/// Errors that can occurr during extraction from multiple sources. 
+#[non_exhaustive] +#[allow(missing_docs)] +#[derive(Debug, thiserror::Error)] +pub enum Error { + #[error("object file has multiple overlapping code sections")] + OverlappingCodeSections, + + #[error("all range extractors failed: {0:?}")] + AllExtractorsFailed(Box>), + + #[error("objfile: {0}")] + Objfile(#[from] objfile::Error), +} + +/// Per extractor statistics. +#[derive(Debug)] +pub struct PerExtractorStats { + /// Name of the extractor. + pub name: String, + + /// Total number of ranges produced. + pub ranges_produced: u64, + + /// Number of ranges that made it into the output file. + pub ranges_accepted: u64, + + /// Number of ranges that were rejected because they were buggy or already covered. + pub ranges_rejected: u64, + + /// Number of inline children not correctly following their parent root. + pub unexpected_inline_children: u64, + + /// Extractor specific statistics (on success). + pub stats: Option, + + /// Extractor error (in case of failure). + pub error: Option, +} + +/// Combined statistics from all inner extractors. +#[derive(Debug, Default)] +pub struct Stats { + /// Number of extractors that completed successful. + pub extractors_succeeded: u64, + + /// Number of extractors that exited prematurely. + pub extractors_failed: u64, + + /// Detailed per-extractor statistics. + pub per_extractor: Vec, +} + +impl Stats { + /// Sum of all ranges accepted from all extractors. + pub fn total_ranges_accepted(&self) -> u64 { + self.per_extractor.iter().map(|x| x.ranges_accepted).sum() + } +} + +/// Extractor that combines the outputs from multiple other extractors. +/// +/// The output of multiple range extractors are combined by keeping a +/// coverage map of ranges that were already emitted by previous extractors, +/// dropping any duplicate ranges. Extractors added earlier take precedence +/// over extractors added later. 
+pub struct Extractor<'inner> { + inner: Vec<(String, Box)>, + cov_map_scale: u64, + code_sections: Vec>, +} + +impl<'inner> Extractor<'inner> { + /// Create a new multi range extractor. + pub fn new(obj: &objfile::Reader<'_>) -> Result { + Ok(Self { + inner: vec![], + cov_map_scale: obj.arch().map_or(1, |x| x.min_code_align()), + code_sections: obj + .memory_map()? + .iter() + .filter(|region| region.protection().map_or(false, |p| p.x)) + .map(objfile::Section::va_range) + .collect(), + }) + } + + /// Add a range extractor. + /// + /// Earlier entries take precedence over later ones. + pub fn add( + &mut self, + name: impl Into, + extr: impl RangeExtractor + Send + 'inner, + ) -> &mut Self { + self.inner.push((name.into(), Box::new(extr))); + self + } +} + +impl RangeExtractor for Extractor<'_> { + fn extract(&self, visitor: RangeVisitor<'_>) -> super::Result> { + let mut cov_map = SegmentedCovMap::new(); + let mut stats = Stats::default(); + let scale = NonZeroU64::new(self.cov_map_scale).expect("buggy coverage map scale"); + + for sec in &self.code_sections { + cov_map + .add_segment(CovMap::with_scale(scale, sec.clone())) + .map_err(|_| super::Error::Multi(Error::OverlappingCodeSections))?; + } + + for (name, extractor) in &self.inner { + let per_extr_stats = run_extractor(&mut cov_map, name.clone(), &**extractor, visitor); + + if per_extr_stats.error.is_some() { + stats.extractors_failed += 1; + } else { + stats.extractors_succeeded += 1; + } + + stats.per_extractor.push(per_extr_stats); + } + + if stats.extractors_succeeded == 0 && stats.extractors_failed > 0 { + let errors: Vec<_> = stats + .per_extractor + .into_iter() + .filter_map(|x| Some((x.name, x.error?))) + .collect(); + + let error = Error::AllExtractorsFailed(Box::new(errors)); + return Err(super::Error::Multi(error)); + } + + Ok(Some(super::Stats::Multi(stats))) + } +} + +fn run_extractor( + cov_map: &mut SegmentedCovMap, + name: String, + extractor: &dyn RangeExtractor, + visitor: 
RangeVisitor<'_>, +) -> PerExtractorStats { + let mut ranges_produced = 0; + let mut ranges_rejected = 0; + let mut ranges_accepted = 0; + let mut unexpected_inline_children = 0; + + // Tracks the last accepted top-level (depth = 0) range. Used to also accept + // the inline records following after the root even if the top-level + // function already marked the whole range as covered. + let mut accept_inline_for: Option> = None; + + let extr_result = extractor.extract(&mut |rng| { + ranges_produced += 1; + + // Start of new top-level function? Consult coverage map. + if rng.depth == 0 { + if cov_map.range_partially_covered(rng.va_range()) { + ranges_rejected += 1; + accept_inline_for = None; + } else { + ranges_accepted += 1; + accept_inline_for = Some(rng.va_range()); + cov_map.add_range(rng.va_range()); + visitor(rng)?; + } + + return Ok(()); + } + + // Inline children: accept if we previously accepted the corresponding + // top-level function range from this extractor. + if let Some(accept_range) = &accept_inline_for { + if range_overlap(accept_range, &rng.va_range()) != Some(rng.va_range()) { + ranges_rejected += 1; + unexpected_inline_children += 1; + return Ok(()); + } + + ranges_accepted += 1; + visitor(rng)?; + } else { + ranges_rejected += 1; + } + + Ok(()) + }); + + let (stats, error) = match extr_result { + Ok(x) => (x, None), + Err(e) => (None, Some(e)), + }; + + PerExtractorStats { + name: name.clone(), + ranges_produced, + ranges_accepted, + ranges_rejected, + unexpected_inline_children, + stats, + error, + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::tests::testdata; + use crate::{symbconv, symbfile}; + use std::ops; + + #[test] + fn empty() { + let obj = objfile::File::load(&testdata("inline")).unwrap(); + let obj = obj.parse().unwrap(); + let multi = Extractor::new(&obj).unwrap(); + let stats = multi.extract(&mut |_| Ok(())).unwrap().unwrap(); + + let symbconv::Stats::Multi(stats) = stats else { + panic!("unexpected stats type 
produced"); + }; + + assert!(stats.per_extractor.is_empty()); + assert_eq!(stats.extractors_failed, 0); + assert_eq!(stats.extractors_succeeded, 0); + } + + struct MockExtractor(Vec<(/* depth */ u32, ops::Range)>); + + impl RangeExtractor for MockExtractor { + fn extract(&self, visitor: RangeVisitor<'_>) -> symbconv::Result> { + for (depth, va_range) in &self.0 { + visitor(symbfile::Range { + elf_va: va_range.start, + length: (va_range.end - va_range.start).try_into().unwrap(), + func: "some_func".to_string(), + file: Some("some_file".to_string()), + call_file: None, + call_line: None, + depth: *depth, + line_table: Default::default(), + }) + .unwrap(); + } + + Ok(None) + } + } + + #[test] + fn multi() { + let obj = objfile::File::load(&testdata("inline")).unwrap(); + let obj = obj.parse().unwrap(); + let code_sec = obj.load_section(b".text").unwrap().unwrap(); + let code_va = code_sec.virt_addr(); + + let extr1 = MockExtractor(vec![ + (0, code_va + 0x10..code_va + 0x20), // A + (1, code_va + 0x1A..code_va + 0x1F), // A1 + (2, code_va + 0x1C..code_va + 0x1F), // A2 + (2, code_va + 0xCC..code_va + 0xDD), // A3 (buggy inline range covered by root) + (0, code_va + 0x30..code_va + 0x40), // B + (0, code_va + 0x3A..code_va + 0x60), // C (partial overlap with B) + (1, code_va + 0x40..code_va + 0x41), // C1 + ]); + + let extr2 = MockExtractor(vec![ + (0, code_va + 0x10..code_va + 0x20), // D (full overlaps A) + (0, code_va + 0x50..code_va + 0x70), // E + (0, code_va + 0x32..code_va + 0x3B), // F (partial overlap with B) + ]); + + let mut multi = Extractor::new(&obj).unwrap(); + multi.add("extr1", extr1); + multi.add("extr2", extr2); + + let mut emitted_ranges = Vec::new(); + let stats = multi + .extract(&mut |rng| { + emitted_ranges.push(rng); + Ok(()) + }) + .unwrap() + .unwrap(); + + let symbconv::Stats::Multi(stats) = stats else { + panic!("unexpected stats type produced"); + }; + + assert_eq!(stats.extractors_succeeded, 2); + assert_eq!(stats.extractors_failed, 0); + 
+ let stats1 = &stats.per_extractor[0]; + assert_eq!(stats1.name, "extr1"); + assert_eq!(stats1.ranges_accepted, 4); + assert_eq!(stats1.ranges_produced, 7); + assert_eq!(stats1.ranges_rejected, 3); + assert_eq!(stats1.unexpected_inline_children, 1); + + let stats2 = &stats.per_extractor[1]; + assert_eq!(stats2.name, "extr2"); + assert_eq!(stats2.ranges_accepted, 1); + assert_eq!(stats2.ranges_produced, 3); + assert_eq!(stats2.ranges_rejected, 2); + assert_eq!(stats2.unexpected_inline_children, 0); + + assert_eq!( + emitted_ranges.len() as u64, + stats1.ranges_accepted + stats2.ranges_accepted + ); + } +} diff --git a/rust-crates/symblib/src/symbconv/obj.rs b/rust-crates/symblib/src/symbconv/obj.rs new file mode 100644 index 00000000..e28ea38b --- /dev/null +++ b/rust-crates/symblib/src/symbconv/obj.rs @@ -0,0 +1,45 @@ +// Copyright The OpenTelemetry Authors +// SPDX-License-Identifier: Apache-2.0 + +//! Translates object file (e.g. ELF) symbols into a range symbfile. + +use super::{Error, RangeExtractor, RangeVisitor, Result, Stats}; +use crate::{demangle, objfile, symbfile}; + +/// Extracts ranges from object file symbols. +pub struct Extractor<'obj> { + obj: &'obj objfile::Reader<'obj>, + source: objfile::SymbolSource, +} + +impl<'obj> Extractor<'obj> { + /// Create a new object file symbol extractor. + pub fn new(obj: &'obj objfile::Reader<'obj>, source: objfile::SymbolSource) -> Self { + Self { obj, source } + } +} + +impl<'obj> RangeExtractor for Extractor<'obj> { + fn extract(&self, visitor: RangeVisitor<'_>) -> Result> { + for sym in self.obj.function_symbols(self.source) { + let rng = obj_symbol_to_range(&sym); + visitor(rng).map_err(Error::Obj)?; + } + + Ok(None) + } +} + +/// Translate an object file symbol to a range. 
+fn obj_symbol_to_range(sym: &objfile::Symbol<'_>) -> symbfile::Range { + symbfile::Range { + elf_va: sym.virt_addr, + length: sym.length as u32, + func: demangle::demangle(sym.name).into_owned(), + file: None, + call_file: None, + call_line: None, + depth: 0, + line_table: Default::default(), + } +} diff --git a/rust-crates/symblib/src/symbfile/mod.rs b/rust-crates/symblib/src/symbfile/mod.rs new file mode 100644 index 00000000..b810e0c9 --- /dev/null +++ b/rust-crates/symblib/src/symbfile/mod.rs @@ -0,0 +1,191 @@ +// Copyright The OpenTelemetry Authors +// SPDX-License-Identifier: Apache-2.0 + +//! Implements the `symbfile` file format. +//! +//! For documentation about the file format, please refer to `proto/symbfile/symbfile.proto`. + +pub mod proto; +pub mod read; +pub mod records; +mod strdedup; +pub mod write; + +// Re-export core types on the main module. +pub use read::Reader; +pub use records::*; +pub use write::Writer; + +/// Magic that every valid symbfile must start with. +const FILE_MAGIC: &[u8; 8] = b"symbfile"; + +/// Size of the persistent read/write buffer for protobuf messages. +const MSG_BUF_CAPACITY: usize = 4096; + +/// Maximum size of an individual message. +const MAX_MSG_SIZE: u32 = 16 * 1024 * 1024; // 16 MiB + +/// Maximum size of the string table before flushing it. +/// +/// Must be smaller than [`MAX_MSG_SIZE`]. +const STRING_TABLE_SIZE_FLUSH_THRESH: u32 = MAX_MSG_SIZE - 64 * 1024; + +/// Maximum size of the internal message buffer in the writer. +/// +/// This impacts how many messages can use the same string table before +/// being written out. +const WRITER_MSG_BUFFER_SIZE: usize = 64 * 1024 * 1024; // 64 MiB + +/// Result type used throughout this module. +pub type Result = std::result::Result; + +/// Errors that can occur when reading or writing `symbfile`s. 
+#[non_exhaustive] +#[allow(missing_docs)] +#[derive(Debug, thiserror::Error)] +pub enum Error { + #[error("Line-table is not sorted by `offset`")] + LineTableNotSorted, + + #[error("File does not start with the expected magic")] + InvalidMagic, + + #[error("Expected message type {:?} but got {:?}", .expected, .actual)] + UnexpectedType { + expected: proto::MessageType, + // None = not known by our protobuf definition + actual: Option, + }, + + #[error("File kind with ID {} is unsupported by this implementation", .0)] + UnsupportedKind(i32), + + #[error("Message type value is invalid")] + InvalidMessageType, + + #[error("Message contains an invalid string table reference")] + InvalidStringTableIndex, + + #[error("Message is missing a required field")] + MissingRequiredField(&'static str), + + #[error("File ended prematurely in the middle of a message")] + TruncatedMessage, + + #[error("Variable-length integer is too big")] + VarIntTooLong, + + #[error("Message of size {} exceeds maximum of {}", .0, MAX_MSG_SIZE)] + MaximumMsgSizeExceeded(u64), + + #[error("Not all arrays in a columnar struct-of-arrays have the same length")] + ColumnLengthMismatch, + + #[error("Encountered relative value without an absolut value preceding it")] + RelativeValueWithoutReference, + + #[error("IO error")] + IO(#[from] std::io::Error), + + #[error("Encoding error")] + Encoding(#[from] prost::EncodeError), + + #[error("Decoding error")] + Decoding(#[from] prost::DecodeError), +} + +#[cfg(test)] +mod tests { + use super::*; + + fn make_test_range(seed: u64) -> Range { + Range { + elf_va: seed * 12, + length: (seed % 1234) as u32, + func: match seed % 2 { + 0 => "main".to_owned(), + 1 => "strlen".to_owned(), + _ => unreachable!(), + }, + file: match seed % 3 { + 0 => Some("main.c".to_owned()), + 1 => None, + 2 => Some("/usr/libc/slen.c".to_owned().to_owned()), + _ => unreachable!(), + }, + call_file: match seed % 3 { + 0 => None, + 1 => Some("foo.h".to_owned()), + 2 => 
Some("bar.c".to_owned()), + _ => unreachable!(), + }, + call_line: match seed % 133 { + 11 | 22 | 33 => None, + other => Some(other as u32 + 1), + }, + depth: (seed % 3) as u32, + line_table: (0..seed % 31) + .map(|i| LineTableEntry { + offset: (seed + i * (seed % 3)) as u32, + line_number: (i * (seed % 7) + 1) as u32, + }) + .collect(), + } + } + + fn make_test_retpad(seed: u64) -> ReturnPad { + ReturnPad { + elf_va: (0x130 * seed) as u64, + entries: (0..seed % 31 + 1) + .map(|i| { + let file = match (seed ^ i) % 3 { + 0 => "main.c", + 1 => "hello.cc", + 2 => "blah.go", + _ => unreachable!(), + }; + + let func = match (seed ^ i) % 4 { + 0 => "main", + 1 => "print_hello", + 2 => "handle_error", + 3 => "do_something", + _ => unreachable!(), + }; + + ReturnPadEntry { + func: func.to_owned(), + file: Some(file.to_owned()), + line: Some((i * (seed % 7)) as u32 + 1), + } + }) + .collect(), + } + } + + #[test] + fn round_trip() { + let msgs: Vec<_> = (0..1000) + .map(|i| { + if (!i) % 3 > 0 { + Record::Range(make_test_range(i)) + } else { + Record::ReturnPad(make_test_retpad(i)) + } + }) + .collect(); + + let mut writer = Writer::new(Vec::new()).unwrap(); + for msg in &msgs { + writer.write(msg.clone()).unwrap(); + } + let buf = writer.finalize().unwrap(); + + let mut reader = Reader::new(&buf[..]).unwrap(); + let mut expected_iter = msgs.iter(); + while let Some(msg) = reader.read().unwrap() { + let expected = expected_iter.next().unwrap(); + assert_eq!(&msg, expected); + } + } +} diff --git a/rust-crates/symblib/src/symbfile/proto.rs b/rust-crates/symblib/src/symbfile/proto.rs new file mode 100644 index 00000000..2d023c01 --- /dev/null +++ b/rust-crates/symblib/src/symbfile/proto.rs @@ -0,0 +1,9 @@ +// Copyright The OpenTelemetry Authors +// SPDX-License-Identifier: Apache-2.0 + +//! Raw protobuf message definitions. + +#![allow(missing_docs)] + +// Simply include protobuf definitions generated by `build.rs`. 
+include!(concat!(env!("OUT_DIR"), "/symbfile.rs")); diff --git a/rust-crates/symblib/src/symbfile/read.rs b/rust-crates/symblib/src/symbfile/read.rs new file mode 100644 index 00000000..fce9b2d1 --- /dev/null +++ b/rust-crates/symblib/src/symbfile/read.rs @@ -0,0 +1,315 @@ +// Copyright The OpenTelemetry Authors +// SPDX-License-Identifier: Apache-2.0 + +//! Implements a reader for the `symbfile` file format. + +use super::*; +use crate::symbfile::proto::MessageType; +use crate::VirtAddr; +use fallible_iterator::FallibleIterator; +use std::io; + +/// Reader for the `symbfile` file format. +#[derive(Debug)] +pub struct Reader { + inner: I, + read_buf: Vec, + string_table: Vec, + prev_elf_va: Option, +} + +/// Calculate the absolute ELF VA from context and updating instructions. +/// +/// This is a macro because despite having exactly the same signature, the +/// `ElfVa` types in return pads and ranges are distinct auto-generated types, +/// so a regular function couldn't abstract over both of them. +macro_rules! elf_va_rel2abs { + ($this:expr, $record:expr) => {{ + let abs = match $record.elf_va { + Some(ElfVa::SetElfVa(abs)) => abs, + Some(ElfVa::DeltaElfVa(rel)) => $this + .prev_elf_va + .ok_or(Error::RelativeValueWithoutReference)? + .wrapping_add_signed(rel), + None => return Err(Error::MissingRequiredField("elf_va")), + }; + + $this.prev_elf_va = Some(abs); + + abs + }}; +} + +impl Reader { + /// Create a new reader. + /// + /// It's strongly advised to pass a buffered reader. + pub fn new(mut inner: I) -> Result { + // Check magic. + let mut magic = [0u8; FILE_MAGIC.len()]; + inner.read_exact(&mut magic)?; + if &magic != FILE_MAGIC { + return Err(Error::InvalidMagic); + } + + // Read and validate header. + let mut read_buf = Vec::with_capacity(MSG_BUF_CAPACITY); + let Some((kind, len)) = read_msg_prefix(&mut inner)? 
else { + return Err(Error::TruncatedMessage); + }; + + if kind != Some(MessageType::MtHeader) { + return Err(Error::UnexpectedType { + expected: MessageType::MtHeader, + actual: kind, + }); + } + + let _: proto::Header = read_msg(&mut inner, len, &mut read_buf)?; + + Ok(Self { + inner, + read_buf, + string_table: Vec::new(), + prev_elf_va: None, + }) + } + + /// Read the next record from the symbfile. + pub fn read(&mut self) -> Result> { + loop { + let Some((kind, len)) = read_msg_prefix(&mut self.inner)? else { + return Ok(None); + }; + + match kind { + Some(MessageType::MtInvalid | MessageType::MtHeader) => { + return Err(Error::InvalidMessageType); + } + + Some(MessageType::MtRangeV1) => { + let raw: proto::RangeV1 = read_msg(&mut self.inner, len, &mut self.read_buf)?; + let parsed = self.deserialize_range(raw)?; + return Ok(Some(Record::Range(parsed))); + } + + Some(MessageType::MtReturnPadV1) => { + let raw: proto::ReturnPadV1 = + read_msg(&mut self.inner, len, &mut self.read_buf)?; + let parsed = self.deserialize_return_pad(raw)?; + return Ok(Some(Record::ReturnPad(parsed))); + } + + Some(MessageType::MtStringTableV1) => { + let msg: proto::StringTableV1 = + read_msg(&mut self.inner, len, &mut self.read_buf)?; + self.string_table = msg.strings; + } + + // Skip unsupported messages. + #[allow(unreachable_patterns)] + Some(_) | None => continue, + } + } + } + + /// Convert a range in wire format into our higher-level format. 
+ pub fn deserialize_range(&mut self, range: proto::RangeV1) -> Result { + let line_table = if let Some(lt) = range.line_table { + if lt.offset.len() != lt.line_number.len() { + return Err(Error::ColumnLengthMismatch); + } + + let mut prev_offset = 0; + lt.line_number + .into_iter() + .zip(lt.offset) + .map(|(l, o)| { + let offset = prev_offset + o; + prev_offset += o; + + LineTableEntry { + offset, + line_number: l, + } + }) + .collect() + } else { + Default::default() + }; + + use proto::range_v1::{CallFile, ElfVa, File, Func}; + Ok(Range { + elf_va: elf_va_rel2abs!(self, range), + length: range.length as u32, + func: match range.func { + Some(Func::FuncRef(idx)) => self.str_by_idx(idx)?.to_owned(), + Some(Func::FuncStr(s)) => s, + None => return Err(Error::MissingRequiredField("func")), + }, + file: match range.file { + Some(File::FileRef(idx)) => Some(self.str_by_idx(idx)?.to_owned()), + Some(File::FileStr(s)) => Some(s), + None => None, + }, + call_file: match range.call_file { + Some(CallFile::CallFileRef(idx)) => Some(self.str_by_idx(idx)?.to_owned()), + Some(CallFile::CallFileStr(s)) => Some(s), + None => None, + }, + call_line: if range.call_line == 0 { + None + } else { + Some(range.call_line) + }, + depth: range.depth, + line_table, + }) + } + + /// Convert a return pad in wire format into our higher-level format. 
+ fn deserialize_return_pad(&mut self, pad: proto::ReturnPadV1) -> Result { + if pad.file.len() != pad.func.len() || pad.file.len() != pad.line.len() { + return Err(Error::ColumnLengthMismatch); + } + + use proto::return_pad_v1::ElfVa; + Ok(ReturnPad { + elf_va: elf_va_rel2abs!(self, pad), + entries: pad + .file + .into_iter() + .zip(pad.func) + .zip(pad.line) + .map(|((file, func), line)| { + let file = self.str_by_idx(file)?.to_owned(); + + Ok(ReturnPadEntry { + func: self.str_by_idx(func)?.to_owned(), + file: if file.is_empty() { None } else { Some(file) }, + line: if line == 0 { None } else { Some(line) }, + }) + }) + .collect::>()?, + }) + } + + /// Retrieve the given string table entry via its index. + fn str_by_idx(&self, idx: u32) -> Result<&str> { + let s = self + .string_table + .get(idx as usize) + .ok_or(Error::InvalidStringTableIndex)? + .as_str(); + Ok(s) + } +} + +/// Allow using the reader as an iterator. +impl FallibleIterator for Reader { + type Item = Record; + type Error = Error; + + fn next(&mut self) -> Result, Self::Error> { + self.read() + } +} + +/// Reads the var-int encoded message length and type. +fn read_msg_prefix(mut read: impl io::Read) -> Result, u32)>> { + let Some(length) = read_leb128(&mut read)? else { + // EOF is fine here: the file ended after the previous message. + return Ok(None); + }; + + if length > u64::from(MAX_MSG_SIZE) { + return Err(Error::MaximumMsgSizeExceeded(length)); + } + + let Some(raw_kind) = read_leb128(&mut read)? else { + return Err(Error::TruncatedMessage); + }; + + let kind: i32 = raw_kind.try_into().map_err(|_| Error::InvalidMessageType)?; + + Ok(Some((MessageType::try_from(kind).ok(), length as u32))) +} + +/// Reads a protobuf message from the input stream, +/// using `buf` as a temporary buffer for decoding. 
+fn read_msg( + mut read: impl io::Read, + length: u32, + buf: &mut Vec, +) -> Result { + buf.resize(length as usize, 0); + + match read.read_exact(&mut buf[..]) { + Ok(_) => (), + Err(e) if e.kind() == io::ErrorKind::UnexpectedEof => return Err(Error::TruncatedMessage), + Err(e) => return Err(Error::IO(e)), + }; + + buf.shrink_to(MSG_BUF_CAPACITY); + Ok(M::decode(&buf[..])?) +} + +/// Read an ULEB-128 encoded variable-length integer. +/// +/// If EOF is reached before reading the first byte, `Ok(None)` is returned. +/// If EOF is encountered in the middle of an incomplete var-int sequence, +/// a corresponding IO error is returned. +fn read_leb128(mut read: impl io::Read) -> Result> { + let mut result = 0; + let mut shift = 0; + let mut buf = [0u8]; + + for i in 0..10 { + match read.read_exact(&mut buf) { + Ok(_) => (), + Err(e) if i == 0 && e.kind() == io::ErrorKind::UnexpectedEof => return Ok(None), + Err(e) => return Err(Error::IO(e)), + } + + result |= ((buf[0] & 0x7F) as u64) << shift; + + if buf[0] & 0x80 == 0 { + return Ok(Some(result)); + } + + shift += 7; + } + + Err(Error::VarIntTooLong) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn leb128() { + // Test cases ported from Tim's Go implementation. 
+ assert!(matches!( + read_leb128(&[0xE5, 0x8E, 0xA6][..]), + Err(Error::IO(e)) if e.kind() == io::ErrorKind::UnexpectedEof, + )); + assert!(matches!(read_leb128(&[][..]), Ok(None),)); + assert!(matches!( + read_leb128(&[0x95, 0x9a, 0xef, 0x3a][..]), + Ok(Some(123456789)), + )); + assert!(matches!( + read_leb128(&[0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x01][..]), + Ok(Some(u64::MAX)), + )); + assert!(matches!(read_leb128(&[0x00][..]), Ok(Some(0)))); + assert!(matches!(read_leb128(&[0x01][..]), Ok(Some(1)))); + assert!(matches!(read_leb128(&[0x7f][..]), Ok(Some(0x7f)))); + assert!(matches!(read_leb128(&[0x7f][..]), Ok(Some(127)))); + assert!(matches!(read_leb128(&[0x80, 0x01][..]), Ok(Some(128)))); + assert!(matches!(read_leb128(&[0x80, 0x01][..]), Ok(Some(128)))); + assert!(matches!(read_leb128(&[0xff, 0x01][..]), Ok(Some(255)))); + assert!(matches!(read_leb128(&[0x80, 0x02][..]), Ok(Some(256)))); + } +} diff --git a/rust-crates/symblib/src/symbfile/records.rs b/rust-crates/symblib/src/symbfile/records.rs new file mode 100644 index 00000000..5c81ddf5 --- /dev/null +++ b/rust-crates/symblib/src/symbfile/records.rs @@ -0,0 +1,190 @@ +// Copyright The OpenTelemetry Authors +// SPDX-License-Identifier: Apache-2.0 + +//! Rust representation of the payload data in `symbfile` files. +//! +//! This is a higher-level, more idiomatic representation of the protobuf +//! messages, hiding away implementation details like strings getting replaced +//! with references into the string table, relative integer encodings and columnar +//! representations. + +use crate::VirtAddr; + +use smallvec::SmallVec; + +/// [`Range`] or [`ReturnPad`] symbfile record. +#[derive(Debug, Clone, PartialEq, Eq)] +pub enum Record { + /// Range symbol information. + Range(Range), + /// Point symbol information. + ReturnPad(ReturnPad), +} + +impl Record { + /// Assume that the record is a range and unwrap it. + /// + /// # Panics + /// + /// If the record is not in fact a range.
+ pub fn unwrap_range(self) -> Range { + match self { + Record::Range(range) => range, + _ => panic!("tried to unwrap a non-range as a range"), + } + } +} + +/// Create a [`Record`] from a [`Range`]. +impl From for Record { + fn from(x: Range) -> Self { + Self::Range(x) + } +} + +/// Create a [`Record`] from a [`ReturnPad`]. +impl From for Record { + fn from(x: ReturnPad) -> Self { + Self::ReturnPad(x) + } +} + +/// High-level representation of the [`RangeV1`] protobuf struct. +/// +/// Please refer to [raw protobuf message][`RangeV1`] for more details. +/// +/// [`RangeV1`]: super::proto::RangeV1 +#[allow(missing_docs)] +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct Range { + /// Start address of the instruction range, in ELF virtual address space. + pub elf_va: VirtAddr, + /// Length of the instruction sequence. + pub length: u32, + /// Demangled name of the function. + pub func: String, + /// Source file that these instructions were generated from. + pub file: Option, + /// The file that issued the call to the inline function. `None` if depth = 0 + /// or if the call file is equal to the file of the parent record + /// (depth - 1). + pub call_file: Option, + /// Absolute line number of the call to the inline function. `None` if depth is 0. + pub call_line: Option, + /// Depth in the inline function tree, starting at 0 for the top-level function. + pub depth: u32, + /// Line table for this executable range. + pub line_table: SmallVec<[LineTableEntry; 8]>, +} + +impl Range { + /// Construct a range from the `elf_va` and `length` fields. + pub fn va_range(&self) -> std::ops::Range { + self.elf_va..(self.elf_va.saturating_add(u64::from(self.length))) + } + + /// Looks up the line number for the given virtual address. + /// + /// Note that the result of this method is only valid if you made sure that + /// this range is the most concrete (highest depth) instance covering this + /// range.
For ranges that are covered by other inline instances, please + /// refer to the `call_line` field in the `depth + 1` range instead. + pub fn line_number_for_va(&self, va: VirtAddr) -> Option { + let Some(max_offs) = va.checked_sub(self.elf_va) else { + return None; + }; + + let mut line = None; + for lte in &self.line_table { + if lte.offset as VirtAddr > max_offs { + break; + } + line = Some(lte.line_number); + } + + line + } +} + +/// High-level representation of the [`LineTableEntry`] protobuf struct. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub struct LineTableEntry { + /// Offset relative to [`Range::elf_va`]. + pub offset: u32, + /// Line number in the source file. + pub line_number: u32, +} + +/// High-level representation of the [`ReturnPadV1`] protobuf struct. +/// +/// Please refer to [raw protobuf message][`ReturnPadV1`] for more details. +/// +/// [`ReturnPadV1`]: super::proto::ReturnPadV1 +#[allow(missing_docs)] +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct ReturnPad { + /// Address of the return pad, in ELF virtual address space. + pub elf_va: VirtAddr, + + /// Inline stack trace for the address. + pub entries: SmallVec<[ReturnPadEntry; 4]>, +} + +/// AoS representation of the [`ReturnPadV1`] columnar stack trace. +/// +/// Please refer to the [raw protobuf message][`ReturnPadV1`] for details. +/// +/// [`ReturnPadV1`]: super::proto::ReturnPadV1 +#[allow(missing_docs)] +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct ReturnPadEntry { + /// Name of the function. + pub func: String, + /// Source file that these instructions were generated from. + pub file: Option, + /// Absolute source line number. 
+ pub line: Option, +} + +#[cfg(test)] +mod tests { + use super::*; + use smallvec::smallvec; + + #[test] + fn line_number_lookup() { + let range = Range { + elf_va: 0x123, + depth: 0, + length: 0, + func: "".into(), + file: None, + call_line: None, + call_file: None, + line_table: smallvec![ + LineTableEntry { + offset: 2, + line_number: 2, + }, + LineTableEntry { + offset: 4, + line_number: 5, + }, + LineTableEntry { + offset: 100, + line_number: 99, + }, + ], + }; + + assert_eq!(range.line_number_for_va(0x123 - 1), None); + assert_eq!(range.line_number_for_va(0x123 + 0), None); + assert_eq!(range.line_number_for_va(0x123 + 1), None); + assert_eq!(range.line_number_for_va(0x123 + 2), Some(2)); + assert_eq!(range.line_number_for_va(0x123 + 3), Some(2)); + assert_eq!(range.line_number_for_va(0x123 + 4), Some(5)); + assert_eq!(range.line_number_for_va(0x123 + 99), Some(5)); + assert_eq!(range.line_number_for_va(0x123 + 100), Some(99)); + assert_eq!(range.line_number_for_va(0x123 + 10000), Some(99)); + } +} diff --git a/rust-crates/symblib/src/symbfile/strdedup.rs b/rust-crates/symblib/src/symbfile/strdedup.rs new file mode 100644 index 00000000..330496e8 --- /dev/null +++ b/rust-crates/symblib/src/symbfile/strdedup.rs @@ -0,0 +1,167 @@ +// Copyright The OpenTelemetry Authors +// SPDX-License-Identifier: Apache-2.0 + +//! Internal helper types for string table construction during writing. + +use std::collections::HashMap; + +/// Temporary index handed out while the table is being built. +/// +/// Values should be considered opaque / an implementation detail. +pub type TempIdx = u32; + +/// Index in the final table. +/// +/// Values are indices into the final `Vec` table. +pub type Idx = u32; + +/// Translates temporary indices to the final ones. +#[derive(Debug)] +pub struct Mapper { + /// Map of unique strings that should be inlined as strings. + unique: HashMap, + + /// Mapping from temporary to final index after removing unique strings.
+ translation: HashMap, + + /// Actual string table. + table: Vec, +} + +impl Mapper { + /// Translate a given temporary index to the final array position or return + /// the string if it wasn't actually duplicated and should be inlined. + /// + /// This is a destructive action: unique strings are taken out of the table + /// on the first call for their index. + pub fn translate(&mut self, old_idx: TempIdx) -> Mapping { + if let Some(unique) = self.unique.remove(&old_idx) { + return Mapping::Unique(unique); + } + + if let Some(new_idx) = self.translation.get(&old_idx) { + return Mapping::Translate(*new_idx); + } + + unreachable!("bug: invalid index passed to internal `translate` function") + } + + /// Forces an entry into the string table even if it is unique. + pub fn force_entry(&mut self, old_idx: TempIdx) -> Idx { + match self.translate(old_idx) { + // String was previously not in the table due to being unique: move. + Mapping::Unique(s) => { + let new_idx = self.table.len() as Idx; + self.table.push(s); + self.translation.insert(old_idx, new_idx); + new_idx + } + + // Entry exists already: just return new index. + Mapping::Translate(idx) => idx, + } + } + + /// Consume the translator, returning the final table. + pub fn into_table(self) -> Vec { + self.table + } +} + +#[derive(Debug, PartialEq, Eq)] +pub enum Mapping { + /// String is unique within current sequence: inline it. + Unique(String), + + /// Replace old index with the given new index. + Translate(Idx), +} + +/// Incrementally de-duplicates strings. +#[derive(Debug, Default)] +pub struct Builder { + entries: HashMap, + size_estimate: usize, + next_id: TempIdx, +} + +impl Builder { + /// Look up or construct the temporary ID for the given string. 
+ pub fn index_for_str(&mut self, s: String) -> TempIdx { + match self.entries.get_mut(&s) { + Some(entry) => { + entry.count += 1; + entry.id + } + None => { + // 5 = maximum length of var-int u32 + self.size_estimate += 5 + s.len(); + let id = self.next_id; + let entry = BuilderEntry { id, count: 1 }; + self.entries.insert(s, entry); + self.next_id += 1; + id + } + } + } + + /// Estimated serialized size of the string table, in bytes. + pub fn size_estimate(&self) -> usize { + self.size_estimate + } + + /// Consume the builder, constructing the final string table. + pub fn build(self) -> Mapper { + let (duped, unique): (Vec<_>, Vec<_>) = + self.entries.into_iter().partition(|x| x.1.count > 1); + + let unique: HashMap<_, _> = unique + .into_iter() + .map(|entry| (entry.1.id, entry.0)) + .collect(); + + let translation: HashMap<_, _> = duped + .iter() + .enumerate() + .map(|(new_idx, entry)| (entry.1.id, new_idx as Idx)) + .collect(); + + let table = duped.into_iter().map(|x| x.0).collect(); + + Mapper { + unique, + translation, + table, + } + } +} + +#[derive(Debug)] +struct BuilderEntry { + id: TempIdx, + count: usize, +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn dedup() { + let mut builder = Builder::default(); + + let id_abc = builder.index_for_str("abc".into()); + let id_bcd = builder.index_for_str("bcd".into()); + assert_eq!(id_abc, builder.index_for_str("abc".into())); + assert_ne!(id_abc, id_bcd); + let id_xyz = builder.index_for_str("xyz".into()); + + let mut mapper = builder.build(); + assert_eq!(mapper.translate(id_abc), Mapping::Translate(0)); + assert_eq!(mapper.force_entry(id_xyz), 1); + assert_eq!(mapper.translate(id_bcd), Mapping::Unique("bcd".to_owned())); + + let table = mapper.into_table(); + assert_eq!(table, &["abc", "xyz"]); + } +} diff --git a/rust-crates/symblib/src/symbfile/write.rs b/rust-crates/symblib/src/symbfile/write.rs new file mode 100644 index 00000000..084144d9 --- /dev/null +++ 
b/rust-crates/symblib/src/symbfile/write.rs @@ -0,0 +1,266 @@ +// Copyright The OpenTelemetry Authors +// SPDX-License-Identifier: Apache-2.0 + +//! Implements a writer for the `symbfile` file format. + +use super::*; +use crate::symbfile::proto::MessageType; +use crate::VirtAddr; +use std::{io, mem}; + +/// Writer for the `symbfile` file format. +#[derive(Debug)] +pub struct Writer { + out: O, + write_buf: Vec, + string_table: strdedup::Builder, + buffered_msgs: Vec, + prev_elf_va: Option, +} + +/// Calculate either a relative or absolute ELF VA update. +/// +/// This is a macro because despite having exactly the same signature, the +/// `ElfVa` types in return pads and ranges are distinct auto-generated types, +/// so a regular function couldn't abstract over both of them. +macro_rules! elf_va_abs2rel { + ($this:expr, $record:expr) => {{ + match mem::replace(&mut $this.prev_elf_va, Some($record.elf_va)) { + Some(prev) => { + let i128_delta = $record.elf_va as i128 - prev as i128; + let maybe_i64_delta: Result = i128_delta.try_into(); + + // Jumps further away than 2^63 cannot be expressed as i64: + // use absolute updates to represent such cases. + if let Ok(delta) = maybe_i64_delta { + Some(ElfVa::DeltaElfVa(delta)) + } else { + Some(ElfVa::SetElfVa($record.elf_va)) + } + } + None => Some(ElfVa::SetElfVa($record.elf_va)), + } + }}; +} + +impl Writer { + /// Create a new writer that outputs into `out`. + pub fn new(out: O) -> Result { + let mut writer = Writer { + out, + write_buf: Vec::with_capacity(MSG_BUF_CAPACITY), + string_table: strdedup::Builder::default(), + buffered_msgs: Vec::new(), + prev_elf_va: None, + }; + + writer.out.write_all(FILE_MAGIC)?; + writer.write_msg(MessageType::MtHeader, proto::Header {})?; + + Ok(writer) + } + + /// Write a record to the file. 
+ pub fn write(&mut self, record: impl Into) -> Result { + let msg = match record.into() { + Record::Range(record) => BufferedMsg::Range(self.serialize_range(record)?), + Record::ReturnPad(pad) => BufferedMsg::ReturnPad(self.serialize_return_pad(pad)?), + }; + + self.buffered_msgs.push(msg); + + if self.string_table.size_estimate() >= STRING_TABLE_SIZE_FLUSH_THRESH as usize + || self.buffered_msgs.len() * mem::size_of::() >= WRITER_MSG_BUFFER_SIZE + { + self.flush_buffered_msgs()?; + } + + Ok(()) + } + + /// Convert a high-level range to the wire format. + fn serialize_range(&mut self, range: Range) -> Result { + use proto::range_v1::{CallFile, ElfVa, File, Func}; + + let line_table = if range.line_table.is_empty() { + None + } else { + let mut columnar = proto::LineTable { + offset: Vec::with_capacity(range.line_table.len()), + line_number: Vec::with_capacity(range.line_table.len()), + }; + + let mut prev_offset = 0; + for row in range.line_table { + let offset_delta = row + .offset + .checked_sub(prev_offset) + .ok_or(Error::LineTableNotSorted)?; + + columnar.offset.push(offset_delta); + columnar.line_number.push(row.line_number); + + prev_offset = row.offset; + } + + debug_assert_eq!(columnar.offset.len(), columnar.line_number.len()); + + Some(columnar) + }; + + // Transform the remaining data. + Ok(proto::RangeV1 { + elf_va: elf_va_abs2rel!(self, range), + length: u64::from(range.length), + func: Some(Func::FuncRef(self.string_table.index_for_str(range.func))), + file: range + .file + .map(|x| File::FileRef(self.string_table.index_for_str(x))), + call_line: range.call_line.unwrap_or(0), + call_file: range + .call_file + .map(|x| CallFile::CallFileRef(self.string_table.index_for_str(x))), + depth: range.depth, + line_table, + }) + } + + /// Convert a high-level return pad to the wire format. 
+ fn serialize_return_pad(&mut self, pad: ReturnPad) -> Result { + use proto::return_pad_v1::ElfVa; + let mut return_pads = proto::ReturnPadV1 { + elf_va: elf_va_abs2rel!(self, pad), + func: Vec::with_capacity(pad.entries.len()), + file: Vec::with_capacity(pad.entries.len()), + line: Vec::with_capacity(pad.entries.len()), + }; + + for entry in pad.entries { + return_pads + .func + .push(self.string_table.index_for_str(entry.func)); + return_pads.file.push( + self.string_table + .index_for_str(entry.file.unwrap_or_default()), + ); + return_pads.line.push(entry.line.unwrap_or_default()); + } + + Ok(return_pads) + } + + /// Write out and clear the buffered messages and string table. + fn flush_buffered_msgs(&mut self) -> Result { + let mut translator = mem::take(&mut self.string_table).build(); + let mut msgs = mem::take(&mut self.buffered_msgs); + + // Fixing up the messages can still mutate the final string table, so + // we have to do two passes through the array for fixup and sending. + for buffered_msg in &mut msgs { + match buffered_msg { + BufferedMsg::Range(range) => Self::fix_up_range(&mut translator, range), + BufferedMsg::ReturnPad(pad) => Self::fix_up_return_pad(&mut translator, pad), + }; + } + + self.write_msg( + MessageType::MtStringTableV1, + proto::StringTableV1 { + strings: translator.into_table(), + }, + )?; + + for buffered_msg in msgs { + match buffered_msg { + BufferedMsg::Range(range) => self.write_msg(MessageType::MtRangeV1, range)?, + BufferedMsg::ReturnPad(pad) => self.write_msg(MessageType::MtReturnPadV1, pad)?, + }; + } + + Ok(()) + } + + fn fix_up_range(trans: &mut strdedup::Mapper, range: &mut proto::RangeV1) { + use proto::range_v1::{CallFile, File, Func}; + use strdedup::Mapping::*; + + let Some(Func::FuncRef(func_idx)) = &mut range.func else { + unreachable!("bug: non index func field") + }; + range.func = match trans.translate(*func_idx) { + Unique(x) => Some(Func::FuncStr(x)), + Translate(x) => Some(Func::FuncRef(x)), + }; + + if let 
Some(file) = &mut range.file { + let File::FileRef(file_idx) = file else { + unreachable!("bug: non index file field") + }; + *file = match trans.translate(*file_idx) { + Unique(x) => File::FileStr(x), + Translate(x) => File::FileRef(x), + }; + } + + if let Some(call_file) = &mut range.call_file { + let CallFile::CallFileRef(call_file_idx) = call_file else { + unreachable!("bug: non index call_file field") + }; + *call_file = match trans.translate(*call_file_idx) { + Unique(x) => CallFile::CallFileStr(x), + Translate(x) => CallFile::CallFileRef(x), + }; + } + } + + fn fix_up_return_pad(trans: &mut strdedup::Mapper, pad: &mut proto::ReturnPadV1) { + for field in [&mut pad.func, &mut pad.file] { + for item in field { + *item = trans.force_entry(*item); + } + } + } + + /// Finalize the file, flushing all remaining buffers. + /// + /// Returns the output stream once all buffers are flushed. + pub fn finalize(mut self) -> Result { + self.flush_buffered_msgs()?; + self.out.flush()?; + Ok(self.out) + } + + /// Gets an immutable reference to the underlying stream. + pub fn stream_ref(&self) -> &O { + &self.out + } + + /// Write the given message to the output stream. + /// + /// `write_buf` is used as a temporary buffer to avoid unnecessarily + /// allocating and freeing on every call. + fn write_msg(&mut self, kind: MessageType, msg: impl prost::Message) -> Result { + self.write_buf.clear(); + + let encoded_len = msg.encoded_len(); + if encoded_len > MAX_MSG_SIZE as usize { + return Err(Error::MaximumMsgSizeExceeded(encoded_len as u64)); + } + + prost::encode_length_delimiter(encoded_len, &mut self.write_buf)?; + prost::encode_length_delimiter(kind as usize, &mut self.write_buf)?; + msg.encode(&mut self.write_buf)?; + self.out.write_all(&self.write_buf)?; + + // Make sure the write buffer doesn't stay huge if one message was big. 
+ self.write_buf.shrink_to(MSG_BUF_CAPACITY); + + Ok(()) + } +} + +#[derive(Debug)] +enum BufferedMsg { + Range(proto::RangeV1), + ReturnPad(proto::ReturnPadV1), +} diff --git a/rust-crates/symblib/testdata/Makefile b/rust-crates/symblib/testdata/Makefile new file mode 100644 index 00000000..8b584f33 --- /dev/null +++ b/rust-crates/symblib/testdata/Makefile @@ -0,0 +1,31 @@ +.PHONY: inline inline-compressed-dwarf inline-split-dwarf inline-big-fake-compressed-dwarf \ + inline-no-tco clean inline-compressed-dwarf-zstd + +all: inline inline-compressed-dwarf inline-split-dwarf inline-big-fake-compressed-dwarf \ + inline-no-tco inline-compressed-dwarf-zstd + +inline: inline.c + cc $< -o $@ -O2 -g + +inline-no-tco: inline.c + cc $< -o $@ -O2 -g -fno-omit-frame-pointer -fno-optimize-sibling-calls + +inline-compressed-dwarf: inline + objcopy --compress-debug-sections=zlib $< $@ + +inline-compressed-dwarf-zstd: inline + objcopy --compress-debug-sections=zstd $< $@ + +inline-big-fake-compressed-dwarf: inline + dd if=/dev/zero bs=4M count=16 of=/tmp/big-fake-dwarf + # objcopy only supports compressing DWARF sections, not arbitrary ones, + # so we swap a DWARF section here to work around that limitation + objcopy --update-section .debug_info=/tmp/big-fake-dwarf $< $@ + objcopy --compress-debug-sections $@ + +inline-split-dwarf: inline + cp inline inline-split-dwarf + dwz -M meow -m inline-split-dwarf.dwp inline-split-dwarf inline-split-dwarf + +clean: + echo "not deleting anything: executables are meant to be kept under VC" diff --git a/rust-crates/symblib/testdata/README.md b/rust-crates/symblib/testdata/README.md new file mode 100644 index 00000000..ffe65eb7 --- /dev/null +++ b/rust-crates/symblib/testdata/README.md @@ -0,0 +1,2 @@ +Executables here are intentionally kept in VC to allow tests to hard-code aspects like text +section sizes and build IDs to validate their parsing. 
\ No newline at end of file diff --git a/rust-crates/symblib/testdata/inline b/rust-crates/symblib/testdata/inline new file mode 100755 index 00000000..be093a38 Binary files /dev/null and b/rust-crates/symblib/testdata/inline differ diff --git a/rust-crates/symblib/testdata/inline-big-fake-compressed-dwarf b/rust-crates/symblib/testdata/inline-big-fake-compressed-dwarf new file mode 100755 index 00000000..664b2da4 Binary files /dev/null and b/rust-crates/symblib/testdata/inline-big-fake-compressed-dwarf differ diff --git a/rust-crates/symblib/testdata/inline-compressed-dwarf b/rust-crates/symblib/testdata/inline-compressed-dwarf new file mode 100755 index 00000000..ce8d288c Binary files /dev/null and b/rust-crates/symblib/testdata/inline-compressed-dwarf differ diff --git a/rust-crates/symblib/testdata/inline-compressed-dwarf-zstd b/rust-crates/symblib/testdata/inline-compressed-dwarf-zstd new file mode 100755 index 00000000..62bc2260 Binary files /dev/null and b/rust-crates/symblib/testdata/inline-compressed-dwarf-zstd differ diff --git a/rust-crates/symblib/testdata/inline-no-tco b/rust-crates/symblib/testdata/inline-no-tco new file mode 100755 index 00000000..1cee3a2b Binary files /dev/null and b/rust-crates/symblib/testdata/inline-no-tco differ diff --git a/rust-crates/symblib/testdata/inline-no-tco.ranges.symbfile b/rust-crates/symblib/testdata/inline-no-tco.ranges.symbfile new file mode 100644 index 00000000..44a76008 Binary files /dev/null and b/rust-crates/symblib/testdata/inline-no-tco.ranges.symbfile differ diff --git a/rust-crates/symblib/testdata/inline-split-dwarf b/rust-crates/symblib/testdata/inline-split-dwarf new file mode 100755 index 00000000..775fb83b Binary files /dev/null and b/rust-crates/symblib/testdata/inline-split-dwarf differ diff --git a/rust-crates/symblib/testdata/inline-split-dwarf.dwp b/rust-crates/symblib/testdata/inline-split-dwarf.dwp new file mode 100644 index 00000000..1783da63 Binary files /dev/null and 
b/rust-crates/symblib/testdata/inline-split-dwarf.dwp differ diff --git a/rust-crates/symblib/testdata/inline.c b/rust-crates/symblib/testdata/inline.c new file mode 100644 index 00000000..d84de1df --- /dev/null +++ b/rust-crates/symblib/testdata/inline.c @@ -0,0 +1,44 @@ +// Copyright The OpenTelemetry Authors +// SPDX-License-Identifier: Apache-2.0 + +#include <stdio.h> + +#define NOINLINE __attribute__((noinline)) +#define INLINE static inline __attribute__((always_inline)) + +NOINLINE int d() { + printf("hello!\n"); +} + +NOINLINE int c() { + d(); +} + +NOINLINE int b() { + c(); +} + +NOINLINE int a() { + b(); +} + +INLINE int d_inline() { + printf("hello!\n"); +} + +INLINE int c_inline() { + d_inline(); +} + +INLINE int b_inline() { + c_inline(); +} + +INLINE int a_inline() { + b_inline(); +} + +NOINLINE int main() { + a(); + a_inline(); +}