diff --git a/.readthedocs.yaml b/.readthedocs.yaml index 803dfa2..c530a2c 100644 --- a/.readthedocs.yaml +++ b/.readthedocs.yaml @@ -5,6 +5,11 @@ # Required version: 2 +build: + os: "ubuntu-22.04" + tools: + python: "3.11" + # Build documentation in the docs/ directory with Sphinx sphinx: builder: html @@ -17,6 +22,5 @@ formats: # Optionally set the version of Python and requirements required to build your docs python: - version: 3.7 install: - requirements: docs/sphinx/requirements.txt \ No newline at end of file diff --git a/BUILD.gn b/BUILD.gn index 7f0d3f9..2b913c4 100644 --- a/BUILD.gn +++ b/BUILD.gn @@ -68,6 +68,7 @@ ohos_shared_library("libbpf") { "./src/btf.c", "./src/btf.h", "./src/btf_dump.c", + "./src/elf.c", "./src/gen_loader.c", "./src/hashmap.c", "./src/hashmap.h", @@ -90,6 +91,7 @@ ohos_shared_library("libbpf") { "./src/str_error.h", "./src/strset.c", "./src/strset.h", + "./src/zip.c", ] configs = [ ":libbpf_config" ] public_configs = [ ":libbpf_public_config" ] diff --git a/README.OpenSource b/README.OpenSource index 5fbc899..b3563da 100644 --- a/README.OpenSource +++ b/README.OpenSource @@ -3,7 +3,7 @@ "Name": "libbpf", "License": "BSD-2-Clause", "License File": "LICENSE.BSD-2-Clause", - "Version Number": "1.1.0", + "Version Number": "1.3.0", "Owner": "xiazhonglin@huawei.com", "Upstream URL": "https://github.com/libbpf/libbpf", "Description": "a third party library to use eBPF" diff --git a/README.md b/README.md index 9861b16..4642294 100644 --- a/README.md +++ b/README.md @@ -173,7 +173,7 @@ bpf-next to Github sync ======================= All the gory details of syncing can be found in `scripts/sync-kernel.sh` -script. +script. See [SYNC.md](SYNC.md) for instruction. Some header files in this repo (`include/linux/*.h`) are reduced versions of their counterpart files at diff --git a/SYNC.md b/SYNC.md new file mode 100644 index 0000000..14a44a7 --- /dev/null +++ b/SYNC.md @@ -0,0 +1,281 @@ + + + + + +Libbpf sync +=========== + +Libbpf *authoritative source code* is developed as part of [bpf-next Linux source +tree](https://kernel.googlesource.com/pub/scm/linux/kernel/git/bpf/bpf-next) under +`tools/lib/bpf` subdirectory and is periodically synced to Github. + +Most of the mundane mechanical things like bpf and bpf-next tree merge, Git +history transformation, cherry-picking relevant commits, re-generating +auto-generated headers, etc. are taken care by +[sync-kernel.sh script](https://github.com/libbpf/libbpf/blob/master/scripts/sync-kernel.sh). +But occasionally human needs to do few extra things to make everything work +nicely. + +This document goes over the process of syncing libbpf sources from Linux repo +to this Github repository. Feel free to contribute fixes and additions if you +run into new problems not outlined here. + +Setup expectations +------------------ + +Sync script has particular expectation of upstream Linux repo setup. It +expects that current HEAD of that repo points to bpf-next's master branch and +that there is a separate local branch pointing to bpf tree's master branch. +This is important, as the script will automatically merge their histories for +the purpose of libbpf sync. + +Below, we assume that Linux repo is located at `~/linux`, it's current head is +at latest `bpf-next/master`, and libbpf's Github repo is located at +`~/libbpf`, checked out to latest commit on `master` branch. It doesn't matter +from where to run `sync-kernel.sh` script, but we'll be running it from inside +`~/libbpf`. + +``` +$ cd ~/linux && git remote -v | grep -E '^(bpf|bpf-next)' +bpf https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf.git (fetch) +bpf ssh://git@gitolite.kernel.org/pub/scm/linux/kernel/git/bpf/bpf.git +(push) +bpf-next +https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git (fetch) +bpf-next +ssh://git@gitolite.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git (push) +$ git branch -vv | grep -E '^? (master|bpf-master)' +* bpf-master 2d311f480b52 [bpf/master] riscv, bpf: Fix patch_text implicit declaration + master c8ee37bde402 [bpf-next/master] libbpf: Fix bpf_xdp_query() in old kernels +$ git checkout bpf-master && git pull && git checkout master && git pull +... +$ git log --oneline -n1 +c8ee37bde402 (HEAD -> master, bpf-next/master) libbpf: Fix bpf_xdp_query() in old kernels +$ cd ~/libbpf && git checkout master && git pull +Your branch is up to date with 'libbpf/master'. +Already up to date. +``` + +Running setup script +-------------------- + +First step is to always run `sync-kernel.sh` script. It expects three arguments: + +``` +$ scripts/sync-kernel.sh +``` + +Note, that we'll store script's entire output in `/tmp/libbpf-sync.txt` and +put it into PR summary later on. **Please store scripts output and include it +in PR summary for others to check for anything unexpected and suspicious.** + +``` +$ scripts/sync-kernel.sh ~/libbpf ~/linux bpf-master | tee /tmp/libbpf-sync.txt +Dumping existing libbpf commit signatures... +WORKDIR: /home/andriin/libbpf +LINUX REPO: /home/andriin/linux +LIBBPF REPO: /home/andriin/libbpf +... +``` + +Most of the time this will go very uneventful. One expected case when sync +script might require user intervention is if `bpf` tree has some libbpf fixes, +which is nowadays not a very frequent occurence. But if that happens, script +will show you a diff between expected state as of latest bpf-next and synced +Github repo state. And will ask if these changes look good. Please use your +best judgement to verify that differences are indeed from expected `bpf` tree +fixes. E.g., it might look like below: + +``` +Comparing list of files... +Comparing file contents... +--- /home/andriin/linux/include/uapi/linux/netdev.h 2023-02-27 16:54:42.270583372 -0800 ++++ /home/andriin/libbpf/include/uapi/linux/netdev.h 2023-02-27 16:54:34.615530796 -0800 +@@ -19,7 +19,7 @@ + * @NETDEV_XDP_ACT_XSK_ZEROCOPY: This feature informs if netdev supports AF_XDP + * in zero copy mode. + * @NETDEV_XDP_ACT_HW_OFFLOAD: This feature informs if netdev supports XDP hw +- * oflloading. ++ * offloading. + * @NETDEV_XDP_ACT_RX_SG: This feature informs if netdev implements non-linear + * XDP buffer support in the driver napi callback. + * @NETDEV_XDP_ACT_NDO_XMIT_SG: This feature informs if netdev implements +/home/andriin/linux/include/uapi/linux/netdev.h and /home/andriin/libbpf/include/uapi/linux/netdev.h are different! +Unfortunately, there are some inconsistencies, please double check. +Does everything look good? [y/N]: +``` + +If it looks sensible and expected, type `y` and script will proceed. + +If sync is successful, your `~/linux` repo will be left in original state on +the original HEAD commit. `~/libbpf` repo will now be on a new branch, named +`libbpf-sync-` (e.g., `libbpf-sync-2023-02-28T00-53-40.072Z`). + +Push this branch into your fork of `libbpf/libbpf` Github repo and create a PR: + +``` +$ git push --set-upstream origin libbpf-sync-2023-02-28T00-53-40.072Z +Enumerating objects: 130, done. +Counting objects: 100% (115/115), done. +Delta compression using up to 80 threads +Compressing objects: 100% (28/28), done. +Writing objects: 100% (32/32), 5.57 KiB | 1.86 MiB/s, done. +Total 32 (delta 21), reused 0 (delta 0), pack-reused 0 +remote: Resolving deltas: 100% (21/21), completed with 9 local objects. +remote: +remote: Create a pull request for 'libbpf-sync-2023-02-28T00-53-40.072Z' on GitHub by visiting: +remote: https://github.com/anakryiko/libbpf/pull/new/libbpf-sync-2023-02-28T00-53-40.072Z +remote: +To github.com:anakryiko/libbpf.git + * [new branch] libbpf-sync-2023-02-28T00-53-40.072Z -> libbpf-sync-2023-02-28T00-53-40.072Z +Branch 'libbpf-sync-2023-02-28T00-53-40.072Z' set up to track remote branch 'libbpf-sync-2023-02-28T00-53-40.072Z' from 'origin'. +``` + +**Please, adjust PR name to have a properly looking timestamp. Libbpf +maintainers will be very thankful for that!** + +By default Github will turn above branch name into PR with subject "Libbpf sync +2023 02 28 t00 53 40.072 z". Please fix this into a proper timestamp, e.g.: +"Libbpf sync 2023-02-28T00:53:40.072Z". Thank you! + +**Please don't forget to paste contents of /tmp/libbpf-sync.txt into PR +summary!** + +Once PR is created, libbpf CI will run a bunch of tests to check that +everything is good. In simple cases that would be all you'd need to do. In more +complicated cases some extra adjustments might be necessary. + +**Please, keep naming and style consistent.** Prefix CI-related fixes with `ci: ` +prefix. If you had to modify sync script, prefix it with `sync: `. Also make +sure that each such commit has `Signed-off-by: Your Full Name `, +just like you'd do that for Linux upstream patch. Libbpf closely follows kernel +conventions and styling, so please help maintaining that. + +Including new sources +--------------------- + +If entirely new source files (typically `*.c`) were added to the library in the +kernel repository, it may be necessary to add these to the build system +manually (you may notice linker errors otherwise), because the script cannot +handle such changes automatically. To that end, edit `src/Makefile` as +necessary. Commit +[c2495832ced4](https://github.com/libbpf/libbpf/commit/c2495832ced4239bcd376b9954db38a6addd89ca) +is an example of how to go about doing that. + +Similarly, if new public API header files were added, the `Makefile` will need +to be adjusted as well. + +Updating allow/deny lists +------------------------- + +Libbpf CI intentionally runs a subset of latest BPF selftests on old kernel +(4.9 and 5.5, currently). It happens from time to time that some tests that +previously were successfully running on old kernels now don't, typically due to +reliance on some freshly added kernel feature. It might look something like this in [CI logs](https://github.com/libbpf/libbpf/actions/runs/4206303272/jobs/7299609578#step:4:2733): + +``` + All error logs: + serial_test_xdp_info:FAIL:get_xdp_none errno=2 + #283 xdp_info:FAIL + Summary: 49/166 PASSED, 5 SKIPPED, 1 FAILED +``` + +In such case we can either work with upstream to fix test to be compatible with +old kernels, or we'll have to add a test into a denylist (or remove it from +allowlist, like was [done](https://github.com/libbpf/libbpf/commit/ea284299025bf85b85b4923191de6463cd43ccd6) +for the case above). + +``` +$ find . -name '*LIST*' +./ci/vmtest/configs/ALLOWLIST-4.9.0 +./ci/vmtest/configs/DENYLIST-5.5.0 +./ci/vmtest/configs/DENYLIST-latest.s390x +./ci/vmtest/configs/DENYLIST-latest +./ci/vmtest/configs/ALLOWLIST-5.5.0 +``` + +Please determine which tests need to be added/removed from which list. And then +add that as a separate commit. **Please keep using the same branch name, so +that the same PR can be updated.** There is no need to open new PRs for each +such fix. + +Regenerating vmlinux.h header +----------------------------- + +To compile latest BPF selftests against old kernels, we check in pre-generated +[vmlinux.h](https://github.com/libbpf/libbpf/blob/master/.github/actions/build-selftests/vmlinux.h) +header file, located at `.github/actions/build-selftests/vmlinux.h`, which +contains type definitions from latest upstream kernel. When after libbpf sync +upstream BPF selftests require new kernel types, we'd need to regenerate +`vmlinux.h` and check it in as well. + +This will looks something like this in [CI logs](https://github.com/libbpf/libbpf/actions/runs/4198939244/jobs/7283214243#step:4:1903): + +``` + In file included from progs/test_spin_lock_fail.c:5: + /home/runner/work/libbpf/libbpf/.kernel/tools/testing/selftests/bpf/bpf_experimental.h:73:53: error: declaration of 'struct bpf_rb_root' will not be visible outside of this function [-Werror,-Wvisibility] + extern struct bpf_rb_node *bpf_rbtree_remove(struct bpf_rb_root *root, + ^ + /home/runner/work/libbpf/libbpf/.kernel/tools/testing/selftests/bpf/bpf_experimental.h:81:35: error: declaration of 'struct bpf_rb_root' will not be visible outside of this function [-Werror,-Wvisibility] + extern void bpf_rbtree_add(struct bpf_rb_root *root, struct bpf_rb_node *node, + ^ + /home/runner/work/libbpf/libbpf/.kernel/tools/testing/selftests/bpf/bpf_experimental.h:90:52: error: declaration of 'struct bpf_rb_root' will not be visible outside of this function [-Werror,-Wvisibility] + extern struct bpf_rb_node *bpf_rbtree_first(struct bpf_rb_root *root) __ksym; + ^ + 3 errors generated. + make: *** [Makefile:572: /home/runner/work/libbpf/libbpf/.kernel/tools/testing/selftests/bpf/test_spin_lock_fail.bpf.o] Error 1 + make: *** Waiting for unfinished jobs.... + Error: Process completed with exit code 2. +``` + +You'll need to build latest upstream kernel from `bpf-next` tree, using BPF +selftest configs. Concat arch-agnostic and arch-specific configs, build kernel, +then use bpftool to dump `vmlinux.h`: + +``` +$ cd ~/linux +$ cat tools/testing/selftests/bpf/config \ + tools/testing/selftests/bpf/config.x86_64 > .config +$ make -j$(nproc) olddefconfig all +... +$ bpftool btf dump file ~/linux/vmlinux format c > ~/libbpf/.github/actions/build-selftests/vmlinux.h +$ cd ~/libbpf && git add . && git commit -s +``` + +Check in generated `vmlinux.h`, don't forget to use `ci: ` commit prefix, add +it on top of sync commits. Push to Github and let libbpf CI do the checking for +you. See [this commit](https://github.com/libbpf/libbpf/commit/34212c94a64df8eeb1dd5d064630a65e1dfd4c20) +for reference. + +Troubleshooting +--------------- + +If something goes wrong and sync script exits early or is terminated early by +user, you might end up with `~/linux` repo on temporary sync-related branch. +Don't worry, though, sync script never destroys repo state, it follows +"copy-on-write" philosophy and creates new branches where necessary. So it's +very easy to restore previous state. So if anything goes wrong, it's easy to +start fresh: + +``` +$ git branch | grep -E 'libbpf-.*Z' + libbpf-baseline-2023-02-28T00-43-35.146Z + libbpf-bpf-baseline-2023-02-28T00-43-35.146Z + libbpf-bpf-tip-2023-02-28T00-43-35.146Z + libbpf-squash-base-2023-02-28T00-43-35.146Z +* libbpf-squash-tip-2023-02-28T00-43-35.146Z +$ git cherry-pick --abort +$ git checkout master && git branch | grep -E 'libbpf-.*Z' | xargs git br -D +Switched to branch 'master' +Your branch is up to date with 'bpf-next/master'. +Deleted branch libbpf-baseline-2023-02-28T00-43-35.146Z (was 951bce29c898). +Deleted branch libbpf-bpf-baseline-2023-02-28T00-43-35.146Z (was 3a70e0d4c9d7). +Deleted branch libbpf-bpf-tip-2023-02-28T00-43-35.146Z (was 2d311f480b52). +Deleted branch libbpf-squash-base-2023-02-28T00-43-35.146Z (was 957f109ef883). +Deleted branch libbpf-squash-tip-2023-02-28T00-43-35.146Z (was be66130d2339). +Deleted branch libbpf-tip-2023-02-28T00-43-35.146Z (was 2d311f480b52). +``` + +You might need to do the same for your `~/libbpf` repo sometimes, depending at +which stage sync script was terminated. diff --git a/assets/libbpf-logo-compact-darkbg.png b/assets/libbpf-logo-compact-darkbg.png deleted file mode 100644 index aaef60e..0000000 Binary files a/assets/libbpf-logo-compact-darkbg.png and /dev/null differ diff --git a/assets/libbpf-logo-compact-mono.png b/assets/libbpf-logo-compact-mono.png deleted file mode 100644 index 51daed1..0000000 Binary files a/assets/libbpf-logo-compact-mono.png and /dev/null differ diff --git a/assets/libbpf-logo-compact.png b/assets/libbpf-logo-compact.png deleted file mode 100644 index 99ae3ab..0000000 Binary files a/assets/libbpf-logo-compact.png and /dev/null differ diff --git a/assets/libbpf-logo-sideways-darkbg.png b/assets/libbpf-logo-sideways-darkbg.png deleted file mode 100644 index 00d6d83..0000000 Binary files a/assets/libbpf-logo-sideways-darkbg.png and /dev/null differ diff --git a/assets/libbpf-logo-sideways-mono.png b/assets/libbpf-logo-sideways-mono.png deleted file mode 100644 index 82f1ce5..0000000 Binary files a/assets/libbpf-logo-sideways-mono.png and /dev/null differ diff --git a/assets/libbpf-logo-sideways.png b/assets/libbpf-logo-sideways.png deleted file mode 100644 index 48186c3..0000000 Binary files a/assets/libbpf-logo-sideways.png and /dev/null differ diff --git a/assets/libbpf-logo-sparse-darkbg.png b/assets/libbpf-logo-sparse-darkbg.png deleted file mode 100644 index 5e57430..0000000 Binary files a/assets/libbpf-logo-sparse-darkbg.png and /dev/null differ diff --git a/assets/libbpf-logo-sparse-mono.png b/assets/libbpf-logo-sparse-mono.png deleted file mode 100644 index b02d432..0000000 Binary files a/assets/libbpf-logo-sparse-mono.png and /dev/null differ diff --git a/assets/libbpf-logo-sparse.png b/assets/libbpf-logo-sparse.png deleted file mode 100644 index a306588..0000000 Binary files a/assets/libbpf-logo-sparse.png and /dev/null differ diff --git a/ci/diffs/0001-selftests-bpf-xskxceiver-ksft_print_msg-fix-format-t.patch b/ci/diffs/0001-selftests-bpf-xskxceiver-ksft_print_msg-fix-format-t.patch new file mode 100644 index 0000000..e631fac --- /dev/null +++ b/ci/diffs/0001-selftests-bpf-xskxceiver-ksft_print_msg-fix-format-t.patch @@ -0,0 +1,89 @@ +From fe69a1b1b6ed9ffc2c578c63f526026a8ab74f0c Mon Sep 17 00:00:00 2001 +From: Anders Roxell +Date: Thu, 9 Nov 2023 18:43:28 +0100 +Subject: [PATCH] selftests: bpf: xskxceiver: ksft_print_msg: fix format type + error + +Crossbuilding selftests/bpf for architecture arm64, format specifies +type error show up like. + +xskxceiver.c:912:34: error: format specifies type 'int' but the argument +has type '__u64' (aka 'unsigned long long') [-Werror,-Wformat] + ksft_print_msg("[%s] expected meta_count [%d], got meta_count [%d]\n", + ~~ + %llu + __func__, pkt->pkt_nb, meta->count); + ^~~~~~~~~~~ +xskxceiver.c:929:55: error: format specifies type 'unsigned long long' but + the argument has type 'u64' (aka 'unsigned long') [-Werror,-Wformat] + ksft_print_msg("Frag invalid addr: %llx len: %u\n", addr, len); + ~~~~ ^~~~ + +Fixing the issues by casting to (unsigned long long) and changing the +specifiers to be %llu from %d and %u, since with u64s it might be %llx +or %lx, depending on architecture. + +Signed-off-by: Anders Roxell +Link: https://lore.kernel.org/r/20231109174328.1774571-1-anders.roxell@linaro.org +Signed-off-by: Alexei Starovoitov +--- + tools/testing/selftests/bpf/xskxceiver.c | 19 ++++++++++++------- + 1 file changed, 12 insertions(+), 7 deletions(-) + +diff --git a/tools/testing/selftests/bpf/xskxceiver.c b/tools/testing/selftests/bpf/xskxceiver.c +index 591ca9637b23..b604c570309a 100644 +--- a/tools/testing/selftests/bpf/xskxceiver.c ++++ b/tools/testing/selftests/bpf/xskxceiver.c +@@ -908,8 +908,9 @@ static bool is_metadata_correct(struct pkt *pkt, void *buffer, u64 addr) + struct xdp_info *meta = data - sizeof(struct xdp_info); + + if (meta->count != pkt->pkt_nb) { +- ksft_print_msg("[%s] expected meta_count [%d], got meta_count [%d]\n", +- __func__, pkt->pkt_nb, meta->count); ++ ksft_print_msg("[%s] expected meta_count [%d], got meta_count [%llu]\n", ++ __func__, pkt->pkt_nb, ++ (unsigned long long)meta->count); + return false; + } + +@@ -926,11 +927,13 @@ static bool is_frag_valid(struct xsk_umem_info *umem, u64 addr, u32 len, u32 exp + + if (addr >= umem->num_frames * umem->frame_size || + addr + len > umem->num_frames * umem->frame_size) { +- ksft_print_msg("Frag invalid addr: %llx len: %u\n", addr, len); ++ ksft_print_msg("Frag invalid addr: %llx len: %u\n", ++ (unsigned long long)addr, len); + return false; + } + if (!umem->unaligned_mode && addr % umem->frame_size + len > umem->frame_size) { +- ksft_print_msg("Frag crosses frame boundary addr: %llx len: %u\n", addr, len); ++ ksft_print_msg("Frag crosses frame boundary addr: %llx len: %u\n", ++ (unsigned long long)addr, len); + return false; + } + +@@ -1029,7 +1032,8 @@ static int complete_pkts(struct xsk_socket_info *xsk, int batch_size) + u64 addr = *xsk_ring_cons__comp_addr(&xsk->umem->cq, idx + rcvd - 1); + + ksft_print_msg("[%s] Too many packets completed\n", __func__); +- ksft_print_msg("Last completion address: %llx\n", addr); ++ ksft_print_msg("Last completion address: %llx\n", ++ (unsigned long long)addr); + return TEST_FAILURE; + } + +@@ -1513,8 +1517,9 @@ static int validate_tx_invalid_descs(struct ifobject *ifobject) + } + + if (stats.tx_invalid_descs != ifobject->xsk->pkt_stream->nb_pkts / 2) { +- ksft_print_msg("[%s] tx_invalid_descs incorrect. Got [%u] expected [%u]\n", +- __func__, stats.tx_invalid_descs, ++ ksft_print_msg("[%s] tx_invalid_descs incorrect. Got [%llu] expected [%u]\n", ++ __func__, ++ (unsigned long long)stats.tx_invalid_descs, + ifobject->xsk->pkt_stream->nb_pkts); + return TEST_FAILURE; + } +-- +2.34.1 + diff --git a/ci/vmtest/configs/ALLOWLIST-5.5.0 b/ci/vmtest/configs/ALLOWLIST-5.5.0 index 87f72f9..73ab4b2 100644 --- a/ci/vmtest/configs/ALLOWLIST-5.5.0 +++ b/ci/vmtest/configs/ALLOWLIST-5.5.0 @@ -16,7 +16,6 @@ global_data global_data_init global_func_args hashmap -l4lb_all legacy_printk linked_funcs linked_maps @@ -33,11 +32,7 @@ raw_tp_writable_test_run rdonly_maps section_names signal_pending -skeleton sockmap_ktls -sockopt -sockopt_inherit -sockopt_multi spinlock stacktrace_map stacktrace_map_raw_tp @@ -50,6 +45,5 @@ tcp_rtt tp_attach_query usdt/urand_pid_attach xdp -xdp_info xdp_noinline xdp_perf diff --git a/ci/vmtest/configs/DENYLIST-5.5.0 b/ci/vmtest/configs/DENYLIST-5.5.0 index d732bed..f7161d0 100644 --- a/ci/vmtest/configs/DENYLIST-5.5.0 +++ b/ci/vmtest/configs/DENYLIST-5.5.0 @@ -1,118 +1,5 @@ -# This file is not used and is there for historic purposes only. -# See ALLOWLIST-5.5.0 instead. +# This complements ALLOWLIST-5.5.0 but excludes subtest that can't work on 5.5 -# PERMANENTLY DISABLED -align # verifier output format changed -atomics # new atomic operations (v5.12+) -atomic_bounds # new atomic operations (v5.12+) -bind_perm # changed semantics of return values (v5.12+) -bpf_cookie # 5.15+ -bpf_iter # bpf_iter support is missing -bpf_obj_id # bpf_link support missing for GET_OBJ_INFO, GET_FD_BY_ID, etc -bpf_tcp_ca # STRUCT_OPS is missing -btf_map_in_map # inner map leak fixed in 5.8 -btf_skc_cls_ingress # v5.10+ functionality -cg_storage_multi # v5.9+ functionality -cgroup_attach_multi # BPF_F_REPLACE_PROG missing -cgroup_link # LINK_CREATE is missing -cgroup_skb_sk_lookup # bpf_sk_lookup_tcp() helper is missing -check_mtu # missing BPF helper (v5.12+) -cls_redirect # bpf_csum_level() helper is missing -connect_force_port # cgroup/get{peer,sock}name{4,6} support is missing -d_path # v5.10+ feature -enable_stats # BPF_ENABLE_STATS support is missing -fentry_fexit # bpf_prog_test_tracing missing -fentry_test # bpf_prog_test_tracing missing -fexit_bpf2bpf # freplace is missing -fexit_sleep # relies on bpf_trampoline fix in 5.12+ -fexit_test # bpf_prog_test_tracing missing -flow_dissector # bpf_link-based flow dissector is in 5.8+ -flow_dissector_reattach -for_each # v5.12+ -get_func_ip_test # v5.15+ -get_stack_raw_tp # exercising BPF verifier bug causing infinite loop -hash_large_key # v5.11+ -ima # v5.11+ -kfree_skb # 32-bit pointer arith in test_pkt_access -ksyms # __start_BTF has different name -kfunc_call # v5.13+ -link_pinning # bpf_link is missing -linked_vars # v5.13+ -load_bytes_relative # new functionality in 5.8 -lookup_and_delete # v5.14+ -map_init # per-CPU LRU missing -map_ptr # test uses BPF_MAP_TYPE_RINGBUF, added in 5.8 -metadata # v5.10+ -migrate_reuseport # v5.14+ -mmap # 5.5 kernel is too permissive with re-mmaping -modify_return # fmod_ret support is missing -module_attach # module BTF support missing (v5.11+) -netcnt -netns_cookie # v5.15+ -ns_current_pid_tgid # bpf_get_ns_current_pid_tgid() helper is missing -pe_preserve_elems # v5.10+ -perf_branches # bpf_read_branch_records() helper is missing -perf_link # v5.15+ -pkt_access # 32-bit pointer arith in test_pkt_access -probe_read_user_str # kernel bug with garbage bytes at the end -prog_run_xattr # 32-bit pointer arith in test_pkt_access -raw_tp_test_run # v5.10+ -recursion # v5.12+ -ringbuf # BPF_MAP_TYPE_RINGBUF is supported in 5.8+ - -# bug in verifier w/ tracking references -#reference_tracking/classifier/sk_lookup_success -reference_tracking - -select_reuseport # UDP support is missing -send_signal # bpf_send_signal_thread() helper is missing -sk_assign # bpf_sk_assign helper missing -sk_lookup # v5.9+ -sk_storage_tracing # missing bpf_sk_storage_get() helper -skb_ctx # ctx_{size, }_{in, out} in BPF_PROG_TEST_RUN is missing -skb_helpers # helpers added in 5.8+ -skeleton # creates too big ARRAY map -snprintf # v5.13+ -snprintf_btf # v5.10+ -sock_fields # v5.10+ -socket_cookie # v5.12+ -sockmap_basic # uses new socket fields, 5.8+ -sockmap_listen # no listen socket supportin SOCKMAP -sockopt_sk -sockopt_qos_to_cc # v5.15+ -stacktrace_build_id # v5.9+ -stack_var_off # v5.12+ -syscall # v5.14+ -task_local_storage # v5.12+ -task_pt_regs # v5.15+ -tcp_hdr_options # v5.10+, new TCP header options feature in BPF -tcpbpf_user # LINK_CREATE is missing -tc_redirect # v5.14+ -test_bpffs # v5.10+, new CONFIG_BPF_PRELOAD=y and CONFIG_BPF_PRELOAD_UMG=y|m -test_bprm_opts # v5.11+ -test_global_funcs # kernel doesn't support BTF linkage=global on FUNCs -test_local_storage # v5.10+ feature -test_lsm # no BPF_LSM support -test_overhead # no fmod_ret support -test_profiler # needs verifier logic improvements from v5.10+ -test_skb_pkt_end # v5.11+ -timer # v5.15+ -timer_mim # v5.15+ -trace_ext # v5.10+ -trace_printk # v5.14+ -trampoline_count # v5.12+ have lower allowed limits -udp_limit # no cgroup/sock_release BPF program type (5.9+) -varlen # verifier bug fixed in later kernels -vmlinux # hrtimer_nanosleep() signature changed incompatibly -xdp_adjust_tail # new XDP functionality added in 5.8 -xdp_attach # IFLA_XDP_EXPECTED_FD support is missing -xdp_bonding # v5.15+ -xdp_bpf2bpf # freplace is missing -xdp_context_test_run # v5.15+ -xdp_cpumap_attach # v5.9+ -xdp_devmap_attach # new feature in 5.8 -xdp_link # v5.9+ - -# SUBTESTS FAILING (block entire test until blocking subtests works properly) btf # "size check test", "func (Non zero vlen)" tailcalls # tailcall_bpf2bpf_1, tailcall_bpf2bpf_2, tailcall_bpf2bpf_3 +tc_bpf/tc_bpf_non_root diff --git a/ci/vmtest/configs/DENYLIST-latest b/ci/vmtest/configs/DENYLIST-latest index e69de29..7b2275e 100644 --- a/ci/vmtest/configs/DENYLIST-latest +++ b/ci/vmtest/configs/DENYLIST-latest @@ -0,0 +1,9 @@ +decap_sanity # weird failure with decap_sanity_ns netns already existing, TBD +empty_skb # waiting the fix in bpf tree to make it to bpf-next +bpf_nf/tc-bpf-ct # test consistently failing on x86: https://github.com/libbpf/libbpf/pull/698#issuecomment-1590341200 +bpf_nf/xdp-ct # test consistently failing on x86: https://github.com/libbpf/libbpf/pull/698#issuecomment-1590341200 +kprobe_multi_bench_attach # suspected to cause crashes in CI +find_vma # test consistently fails on latest kernel, see https://github.com/libbpf/libbpf/issues/754 for details +bpf_cookie/perf_event +send_signal/send_signal_nmi +send_signal/send_signal_nmi_thread diff --git a/ci/vmtest/configs/DENYLIST-latest.s390x b/ci/vmtest/configs/DENYLIST-latest.s390x index 3684f49..e97d7ba 100644 --- a/ci/vmtest/configs/DENYLIST-latest.s390x +++ b/ci/vmtest/configs/DENYLIST-latest.s390x @@ -1,3 +1,4 @@ # TEMPORARY +sockmap_listen/sockhash VSOCK test_vsock_redir usdt/basic # failing verifier due to bounds check after LLVM update usdt/multispec # same as above diff --git a/ci/vmtest/run_selftests.sh b/ci/vmtest/run_selftests.sh index 1dcde76..0e5ac17 100644 --- a/ci/vmtest/run_selftests.sh +++ b/ci/vmtest/run_selftests.sh @@ -13,7 +13,7 @@ read_lists() { if [[ -s "$path" ]]; then cat "$path" fi; - done) | cut -d'#' -f1 | tr -s ' \t\n' ',' + done) | cut -d'#' -f1 | sed -e 's/^[[:space:]]*//' -e 's/[[:space:]]*$//' | tr -s '\n' ',' } test_progs() { @@ -22,7 +22,7 @@ test_progs() { # "&& true" does not change the return code (it is not executed # if the Python script fails), but it prevents exiting on a # failure due to the "set -e". - ./test_progs ${DENYLIST:+-d$DENYLIST} ${ALLOWLIST:+-a$ALLOWLIST} && true + ./test_progs ${DENYLIST:+-d"$DENYLIST"} ${ALLOWLIST:+-a"$ALLOWLIST"} && true echo "test_progs:$?" >> "${STATUS_FILE}" foldable end test_progs fi @@ -30,7 +30,7 @@ test_progs() { test_progs_no_alu32() { foldable start test_progs-no_alu32 "Testing test_progs-no_alu32" - ./test_progs-no_alu32 ${DENYLIST:+-d$DENYLIST} ${ALLOWLIST:+-a$ALLOWLIST} && true + ./test_progs-no_alu32 ${DENYLIST:+-d"$DENYLIST"} ${ALLOWLIST:+-a"$ALLOWLIST"} && true echo "test_progs-no_alu32:$?" >> "${STATUS_FILE}" foldable end test_progs-no_alu32 } @@ -55,6 +55,13 @@ test_verifier() { foldable end vm_init +foldable start kernel_config "Kconfig" + +zcat /proc/config.gz + +foldable end kernel_config + + configs_path=/${PROJECT_NAME}/selftests/bpf local_configs_path=${PROJECT_NAME}/vmtest/configs DENYLIST=$(read_lists \ diff --git a/docs/conf.py b/docs/conf.py index 1d8714e..a1c6d47 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -18,6 +18,7 @@ extensions = [ 'sphinx.ext.viewcode', 'sphinx.ext.imgmath', 'sphinx.ext.todo', + 'sphinx_rtd_theme', 'breathe', ] diff --git a/docs/index.rst b/docs/index.rst index f9b3b25..7545a20 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -2,23 +2,32 @@ .. _libbpf: +====== libbpf ====== +If you are looking to develop BPF applications using the libbpf library, this +directory contains important documentation that you should read. + +To get started, it is recommended to begin with the :doc:`libbpf Overview +` document, which provides a high-level understanding of the +libbpf APIs and their usage. This will give you a solid foundation to start +exploring and utilizing the various features of libbpf to develop your BPF +applications. + .. toctree:: :maxdepth: 1 + libbpf_overview API Documentation program_types libbpf_naming_convention libbpf_build -This is documentation for libbpf, a userspace library for loading and -interacting with bpf programs. -All general BPF questions, including kernel functionality, libbpf APIs and -their application, should be sent to bpf@vger.kernel.org mailing list. -You can `subscribe `_ to the -mailing list search its `archive `_. -Please search the archive before asking new questions. It very well might -be that this was already addressed or answered before. +All general BPF questions, including kernel functionality, libbpf APIs and their +application, should be sent to bpf@vger.kernel.org mailing list. You can +`subscribe `_ to the mailing list +search its `archive `_. Please search the archive +before asking new questions. It may be that this was already addressed or +answered before. diff --git a/docs/libbpf_naming_convention.rst b/docs/libbpf_naming_convention.rst index c5ac97f..b5b41b6 100644 --- a/docs/libbpf_naming_convention.rst +++ b/docs/libbpf_naming_convention.rst @@ -83,8 +83,8 @@ This prevents from accidentally exporting a symbol, that is not supposed to be a part of ABI what, in turn, improves both libbpf developer- and user-experiences. -ABI versionning ---------------- +ABI versioning +-------------- To make future ABI extensions possible libbpf ABI is versioned. Versioning is implemented by ``libbpf.map`` version script that is @@ -148,7 +148,7 @@ API documentation convention The libbpf API is documented via comments above definitions in header files. These comments can be rendered by doxygen and sphinx for well organized html output. This section describes the -convention in which these comments should be formated. +convention in which these comments should be formatted. Here is an example from btf.h: diff --git a/docs/libbpf_overview.rst b/docs/libbpf_overview.rst new file mode 100644 index 0000000..f36a2d4 --- /dev/null +++ b/docs/libbpf_overview.rst @@ -0,0 +1,228 @@ +.. SPDX-License-Identifier: GPL-2.0 + +=============== +libbpf Overview +=============== + +libbpf is a C-based library containing a BPF loader that takes compiled BPF +object files and prepares and loads them into the Linux kernel. libbpf takes the +heavy lifting of loading, verifying, and attaching BPF programs to various +kernel hooks, allowing BPF application developers to focus only on BPF program +correctness and performance. + +The following are the high-level features supported by libbpf: + +* Provides high-level and low-level APIs for user space programs to interact + with BPF programs. The low-level APIs wrap all the bpf system call + functionality, which is useful when users need more fine-grained control + over the interactions between user space and BPF programs. +* Provides overall support for the BPF object skeleton generated by bpftool. + The skeleton file simplifies the process for the user space programs to access + global variables and work with BPF programs. +* Provides BPF-side APIS, including BPF helper definitions, BPF maps support, + and tracing helpers, allowing developers to simplify BPF code writing. +* Supports BPF CO-RE mechanism, enabling BPF developers to write portable + BPF programs that can be compiled once and run across different kernel + versions. + +This document will delve into the above concepts in detail, providing a deeper +understanding of the capabilities and advantages of libbpf and how it can help +you develop BPF applications efficiently. + +BPF App Lifecycle and libbpf APIs +================================== + +A BPF application consists of one or more BPF programs (either cooperating or +completely independent), BPF maps, and global variables. The global +variables are shared between all BPF programs, which allows them to cooperate on +a common set of data. libbpf provides APIs that user space programs can use to +manipulate the BPF programs by triggering different phases of a BPF application +lifecycle. + +The following section provides a brief overview of each phase in the BPF life +cycle: + +* **Open phase**: In this phase, libbpf parses the BPF + object file and discovers BPF maps, BPF programs, and global variables. After + a BPF app is opened, user space apps can make additional adjustments + (setting BPF program types, if necessary; pre-setting initial values for + global variables, etc.) before all the entities are created and loaded. + +* **Load phase**: In the load phase, libbpf creates BPF + maps, resolves various relocations, and verifies and loads BPF programs into + the kernel. At this point, libbpf validates all the parts of a BPF application + and loads the BPF program into the kernel, but no BPF program has yet been + executed. After the load phase, it’s possible to set up the initial BPF map + state without racing with the BPF program code execution. + +* **Attachment phase**: In this phase, libbpf + attaches BPF programs to various BPF hook points (e.g., tracepoints, kprobes, + cgroup hooks, network packet processing pipeline, etc.). During this + phase, BPF programs perform useful work such as processing + packets, or updating BPF maps and global variables that can be read from user + space. + +* **Tear down phase**: In the tear down phase, + libbpf detaches BPF programs and unloads them from the kernel. BPF maps are + destroyed, and all the resources used by the BPF app are freed. + +BPF Object Skeleton File +======================== + +BPF skeleton is an alternative interface to libbpf APIs for working with BPF +objects. Skeleton code abstract away generic libbpf APIs to significantly +simplify code for manipulating BPF programs from user space. Skeleton code +includes a bytecode representation of the BPF object file, simplifying the +process of distributing your BPF code. With BPF bytecode embedded, there are no +extra files to deploy along with your application binary. + +You can generate the skeleton header file ``(.skel.h)`` for a specific object +file by passing the BPF object to the bpftool. The generated BPF skeleton +provides the following custom functions that correspond to the BPF lifecycle, +each of them prefixed with the specific object name: + +* ``__open()`` – creates and opens BPF application (```` stands for + the specific bpf object name) +* ``__load()`` – instantiates, loads,and verifies BPF application parts +* ``__attach()`` – attaches all auto-attachable BPF programs (it’s + optional, you can have more control by using libbpf APIs directly) +* ``__destroy()`` – detaches all BPF programs and + frees up all used resources + +Using the skeleton code is the recommended way to work with bpf programs. Keep +in mind, BPF skeleton provides access to the underlying BPF object, so whatever +was possible to do with generic libbpf APIs is still possible even when the BPF +skeleton is used. It's an additive convenience feature, with no syscalls, and no +cumbersome code. + +Other Advantages of Using Skeleton File +--------------------------------------- + +* BPF skeleton provides an interface for user space programs to work with BPF + global variables. The skeleton code memory maps global variables as a struct + into user space. The struct interface allows user space programs to initialize + BPF programs before the BPF load phase and fetch and update data from user + space afterward. + +* The ``skel.h`` file reflects the object file structure by listing out the + available maps, programs, etc. BPF skeleton provides direct access to all the + BPF maps and BPF programs as struct fields. This eliminates the need for + string-based lookups with ``bpf_object_find_map_by_name()`` and + ``bpf_object_find_program_by_name()`` APIs, reducing errors due to BPF source + code and user-space code getting out of sync. + +* The embedded bytecode representation of the object file ensures that the + skeleton and the BPF object file are always in sync. + +BPF Helpers +=========== + +libbpf provides BPF-side APIs that BPF programs can use to interact with the +system. The BPF helpers definition allows developers to use them in BPF code as +any other plain C function. For example, there are helper functions to print +debugging messages, get the time since the system was booted, interact with BPF +maps, manipulate network packets, etc. + +For a complete description of what the helpers do, the arguments they take, and +the return value, see the `bpf-helpers +`_ man page. + +BPF CO-RE (Compile Once – Run Everywhere) +========================================= + +BPF programs work in the kernel space and have access to kernel memory and data +structures. One limitation that BPF applications come across is the lack of +portability across different kernel versions and configurations. `BCC +`_ is one of the solutions for BPF +portability. However, it comes with runtime overhead and a large binary size +from embedding the compiler with the application. + +libbpf steps up the BPF program portability by supporting the BPF CO-RE concept. +BPF CO-RE brings together BTF type information, libbpf, and the compiler to +produce a single executable binary that you can run on multiple kernel versions +and configurations. + +To make BPF programs portable libbpf relies on the BTF type information of the +running kernel. Kernel also exposes this self-describing authoritative BTF +information through ``sysfs`` at ``/sys/kernel/btf/vmlinux``. + +You can generate the BTF information for the running kernel with the following +command: + +:: + + $ bpftool btf dump file /sys/kernel/btf/vmlinux format c > vmlinux.h + +The command generates a ``vmlinux.h`` header file with all kernel types +(:doc:`BTF types <../btf>`) that the running kernel uses. Including +``vmlinux.h`` in your BPF program eliminates dependency on system-wide kernel +headers. + +libbpf enables portability of BPF programs by looking at the BPF program’s +recorded BTF type and relocation information and matching them to BTF +information (vmlinux) provided by the running kernel. libbpf then resolves and +matches all the types and fields, and updates necessary offsets and other +relocatable data to ensure that BPF program’s logic functions correctly for a +specific kernel on the host. BPF CO-RE concept thus eliminates overhead +associated with BPF development and allows developers to write portable BPF +applications without modifications and runtime source code compilation on the +target machine. + +The following code snippet shows how to read the parent field of a kernel +``task_struct`` using BPF CO-RE and libbf. The basic helper to read a field in a +CO-RE relocatable manner is ``bpf_core_read(dst, sz, src)``, which will read +``sz`` bytes from the field referenced by ``src`` into the memory pointed to by +``dst``. + +.. code-block:: C + :emphasize-lines: 6 + + //... + struct task_struct *task = (void *)bpf_get_current_task(); + struct task_struct *parent_task; + int err; + + err = bpf_core_read(&parent_task, sizeof(void *), &task->parent); + if (err) { + /* handle error */ + } + + /* parent_task contains the value of task->parent pointer */ + +In the code snippet, we first get a pointer to the current ``task_struct`` using +``bpf_get_current_task()``. We then use ``bpf_core_read()`` to read the parent +field of task struct into the ``parent_task`` variable. ``bpf_core_read()`` is +just like ``bpf_probe_read_kernel()`` BPF helper, except it records information +about the field that should be relocated on the target kernel. i.e, if the +``parent`` field gets shifted to a different offset within +``struct task_struct`` due to some new field added in front of it, libbpf will +automatically adjust the actual offset to the proper value. + +Getting Started with libbpf +=========================== + +Check out the `libbpf-bootstrap `_ +repository with simple examples of using libbpf to build various BPF +applications. + +See also `libbpf API documentation +`_. + +libbpf and Rust +=============== + +If you are building BPF applications in Rust, it is recommended to use the +`Libbpf-rs `_ library instead of bindgen +bindings directly to libbpf. Libbpf-rs wraps libbpf functionality in +Rust-idiomatic interfaces and provides libbpf-cargo plugin to handle BPF code +compilation and skeleton generation. Using Libbpf-rs will make building user +space part of the BPF application easier. Note that the BPF program themselves +must still be written in plain C. + +Additional Documentation +======================== + +* `Program types and ELF Sections `_ +* `API naming convention `_ +* `Building libbpf `_ +* `API documentation Convention `_ diff --git a/docs/program_types.rst b/docs/program_types.rst index ad4d4d5..63bb888 100644 --- a/docs/program_types.rst +++ b/docs/program_types.rst @@ -56,6 +56,16 @@ described in more detail in the footnotes. | | ``BPF_CGROUP_UDP6_RECVMSG`` | ``cgroup/recvmsg6`` | | + +----------------------------------------+----------------------------------+-----------+ | | ``BPF_CGROUP_UDP6_SENDMSG`` | ``cgroup/sendmsg6`` | | +| +----------------------------------------+----------------------------------+-----------+ +| | ``BPF_CGROUP_UNIX_CONNECT`` | ``cgroup/connect_unix`` | | +| +----------------------------------------+----------------------------------+-----------+ +| | ``BPF_CGROUP_UNIX_SENDMSG`` | ``cgroup/sendmsg_unix`` | | +| +----------------------------------------+----------------------------------+-----------+ +| | ``BPF_CGROUP_UNIX_RECVMSG`` | ``cgroup/recvmsg_unix`` | | +| +----------------------------------------+----------------------------------+-----------+ +| | ``BPF_CGROUP_UNIX_GETPEERNAME`` | ``cgroup/getpeername_unix`` | | +| +----------------------------------------+----------------------------------+-----------+ +| | ``BPF_CGROUP_UNIX_GETSOCKNAME`` | ``cgroup/getsockname_unix`` | | +-------------------------------------------+----------------------------------------+----------------------------------+-----------+ | ``BPF_PROG_TYPE_CGROUP_SOCK`` | ``BPF_CGROUP_INET4_POST_BIND`` | ``cgroup/post_bind4`` | | + +----------------------------------------+----------------------------------+-----------+ diff --git a/docs/sphinx/requirements.txt b/docs/sphinx/requirements.txt index 188f51e..c6d3406 100644 --- a/docs/sphinx/requirements.txt +++ b/docs/sphinx/requirements.txt @@ -1 +1,2 @@ -breathe \ No newline at end of file +breathe +sphinx_rtd_theme diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index bc1a3d2..7a54982 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -19,6 +19,7 @@ /* ld/ldx fields */ #define BPF_DW 0x18 /* double word (64-bit) */ +#define BPF_MEMSX 0x80 /* load with sign extension */ #define BPF_ATOMIC 0xc0 /* atomic memory ops - op type in immediate */ #define BPF_XADD 0xc0 /* exclusive add - legacy name */ @@ -931,7 +932,14 @@ enum bpf_map_type { */ BPF_MAP_TYPE_CGROUP_STORAGE = BPF_MAP_TYPE_CGROUP_STORAGE_DEPRECATED, BPF_MAP_TYPE_REUSEPORT_SOCKARRAY, - BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE, + BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE_DEPRECATED, + /* BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE is available to bpf programs + * attaching to a cgroup. The new mechanism (BPF_MAP_TYPE_CGRP_STORAGE + + * local percpu kptr) supports all BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE + * functionality and more. So mark * BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE + * deprecated. + */ + BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE = BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE_DEPRECATED, BPF_MAP_TYPE_QUEUE, BPF_MAP_TYPE_STACK, BPF_MAP_TYPE_SK_STORAGE, @@ -986,6 +994,7 @@ enum bpf_prog_type { BPF_PROG_TYPE_LSM, BPF_PROG_TYPE_SK_LOOKUP, BPF_PROG_TYPE_SYSCALL, /* a program that can execute syscalls */ + BPF_PROG_TYPE_NETFILTER, }; enum bpf_attach_type { @@ -1033,6 +1042,18 @@ enum bpf_attach_type { BPF_PERF_EVENT, BPF_TRACE_KPROBE_MULTI, BPF_LSM_CGROUP, + BPF_STRUCT_OPS, + BPF_NETFILTER, + BPF_TCX_INGRESS, + BPF_TCX_EGRESS, + BPF_TRACE_UPROBE_MULTI, + BPF_CGROUP_UNIX_CONNECT, + BPF_CGROUP_UNIX_SENDMSG, + BPF_CGROUP_UNIX_RECVMSG, + BPF_CGROUP_UNIX_GETPEERNAME, + BPF_CGROUP_UNIX_GETSOCKNAME, + BPF_NETKIT_PRIMARY, + BPF_NETKIT_PEER, __MAX_BPF_ATTACH_TYPE }; @@ -1049,10 +1070,23 @@ enum bpf_link_type { BPF_LINK_TYPE_PERF_EVENT = 7, BPF_LINK_TYPE_KPROBE_MULTI = 8, BPF_LINK_TYPE_STRUCT_OPS = 9, - + BPF_LINK_TYPE_NETFILTER = 10, + BPF_LINK_TYPE_TCX = 11, + BPF_LINK_TYPE_UPROBE_MULTI = 12, + BPF_LINK_TYPE_NETKIT = 13, MAX_BPF_LINK_TYPE, }; +enum bpf_perf_event_type { + BPF_PERF_EVENT_UNSPEC = 0, + BPF_PERF_EVENT_UPROBE = 1, + BPF_PERF_EVENT_URETPROBE = 2, + BPF_PERF_EVENT_KPROBE = 3, + BPF_PERF_EVENT_KRETPROBE = 4, + BPF_PERF_EVENT_TRACEPOINT = 5, + BPF_PERF_EVENT_EVENT = 6, +}; + /* cgroup-bpf attach flags used in BPF_PROG_ATTACH command * * NONE(default): No further bpf programs allowed in the subtree. @@ -1099,7 +1133,12 @@ enum bpf_link_type { */ #define BPF_F_ALLOW_OVERRIDE (1U << 0) #define BPF_F_ALLOW_MULTI (1U << 1) +/* Generic attachment flags. */ #define BPF_F_REPLACE (1U << 2) +#define BPF_F_BEFORE (1U << 3) +#define BPF_F_AFTER (1U << 4) +#define BPF_F_ID (1U << 5) +#define BPF_F_LINK BPF_F_LINK /* 1 << 13 */ /* If BPF_F_STRICT_ALIGNMENT is used in BPF_PROG_LOAD command, the * verifier will perform strict alignment checking as if the kernel @@ -1108,7 +1147,7 @@ enum bpf_link_type { */ #define BPF_F_STRICT_ALIGNMENT (1U << 0) -/* If BPF_F_ANY_ALIGNMENT is used in BPF_PROF_LOAD command, the +/* If BPF_F_ANY_ALIGNMENT is used in BPF_PROG_LOAD command, the * verifier will allow any alignment whatsoever. On platforms * with strict alignment requirements for loads ands stores (such * as sparc and mips) the verifier validates that all loads and @@ -1156,10 +1195,32 @@ enum bpf_link_type { */ #define BPF_F_XDP_HAS_FRAGS (1U << 5) +/* If BPF_F_XDP_DEV_BOUND_ONLY is used in BPF_PROG_LOAD command, the loaded + * program becomes device-bound but can access XDP metadata. + */ +#define BPF_F_XDP_DEV_BOUND_ONLY (1U << 6) + +/* The verifier internal test flag. Behavior is undefined */ +#define BPF_F_TEST_REG_INVARIANTS (1U << 7) + /* link_create.kprobe_multi.flags used in LINK_CREATE command for * BPF_TRACE_KPROBE_MULTI attach type to create return probe. */ -#define BPF_F_KPROBE_MULTI_RETURN (1U << 0) +enum { + BPF_F_KPROBE_MULTI_RETURN = (1U << 0) +}; + +/* link_create.uprobe_multi.flags used in LINK_CREATE command for + * BPF_TRACE_UPROBE_MULTI attach type to create return probe. + */ +enum { + BPF_F_UPROBE_MULTI_RETURN = (1U << 0) +}; + +/* link_create.netfilter.flags used in LINK_CREATE command for + * BPF_PROG_TYPE_NETFILTER to enable IP packet defragmentation. + */ +#define BPF_F_NETFILTER_IP_DEFRAG (1U << 0) /* When BPF ldimm64's insn[0].src_reg != 0 then this can have * the following extensions: @@ -1261,6 +1322,12 @@ enum { /* Create a map that is suitable to be an inner map with dynamic max entries */ BPF_F_INNER_MAP = (1U << 12), + +/* Create a map that will be registered/unregesitered by the backed bpf_link */ + BPF_F_LINK = (1U << 13), + +/* Get path from provided FD in BPF_OBJ_PIN/BPF_OBJ_GET commands */ + BPF_F_PATH_FD = (1U << 14), }; /* Flags for BPF_PROG_QUERY. */ @@ -1398,23 +1465,40 @@ union bpf_attr { __aligned_u64 fd_array; /* array of FDs */ __aligned_u64 core_relos; __u32 core_relo_rec_size; /* sizeof(struct bpf_core_relo) */ + /* output: actual total log contents size (including termintaing zero). + * It could be both larger than original log_size (if log was + * truncated), or smaller (if log buffer wasn't filled completely). + */ + __u32 log_true_size; }; struct { /* anonymous struct used by BPF_OBJ_* commands */ __aligned_u64 pathname; __u32 bpf_fd; __u32 file_flags; + /* Same as dirfd in openat() syscall; see openat(2) + * manpage for details of path FD and pathname semantics; + * path_fd should accompanied by BPF_F_PATH_FD flag set in + * file_flags field, otherwise it should be set to zero; + * if BPF_F_PATH_FD flag is not set, AT_FDCWD is assumed. + */ + __s32 path_fd; }; struct { /* anonymous struct used by BPF_PROG_ATTACH/DETACH commands */ - __u32 target_fd; /* container object to attach to */ - __u32 attach_bpf_fd; /* eBPF program to attach */ + union { + __u32 target_fd; /* target object to attach to or ... */ + __u32 target_ifindex; /* target ifindex */ + }; + __u32 attach_bpf_fd; __u32 attach_type; __u32 attach_flags; - __u32 replace_bpf_fd; /* previously attached eBPF - * program to replace if - * BPF_F_REPLACE is used - */ + __u32 replace_bpf_fd; + union { + __u32 relative_fd; + __u32 relative_id; + }; + __u64 expected_revision; }; struct { /* anonymous struct used by BPF_PROG_TEST_RUN command */ @@ -1460,16 +1544,26 @@ union bpf_attr { } info; struct { /* anonymous struct used by BPF_PROG_QUERY command */ - __u32 target_fd; /* container object to query */ + union { + __u32 target_fd; /* target object to query or ... */ + __u32 target_ifindex; /* target ifindex */ + }; __u32 attach_type; __u32 query_flags; __u32 attach_flags; __aligned_u64 prog_ids; - __u32 prog_cnt; + union { + __u32 prog_cnt; + __u32 count; + }; + __u32 :32; /* output: per-program attach_flags. * not allowed to be set during effective query. */ __aligned_u64 prog_attach_flags; + __aligned_u64 link_ids; + __aligned_u64 link_attach_flags; + __u64 revision; } query; struct { /* anonymous struct used by BPF_RAW_TRACEPOINT_OPEN command */ @@ -1483,6 +1577,11 @@ union bpf_attr { __u32 btf_size; __u32 btf_log_size; __u32 btf_log_level; + /* output: actual total log contents size (including termintaing zero). + * It could be both larger than original log_size (if log was + * truncated), or smaller (if log buffer wasn't filled completely). + */ + __u32 btf_log_true_size; }; struct { @@ -1502,15 +1601,18 @@ union bpf_attr { } task_fd_query; struct { /* struct used by BPF_LINK_CREATE command */ - __u32 prog_fd; /* eBPF program to attach */ union { - __u32 target_fd; /* object to attach to */ - __u32 target_ifindex; /* target ifindex */ + __u32 prog_fd; /* eBPF program to attach */ + __u32 map_fd; /* struct_ops to attach */ + }; + union { + __u32 target_fd; /* target object to attach to or ... */ + __u32 target_ifindex; /* target ifindex */ }; __u32 attach_type; /* attach type */ __u32 flags; /* extra flags */ union { - __u32 target_btf_id; /* btf_id of target to attach to */ + __u32 target_btf_id; /* btf_id of target to attach to */ struct { __aligned_u64 iter_info; /* extra bpf_iter_link_info */ __u32 iter_info_len; /* iter_info length */ @@ -1538,17 +1640,57 @@ union bpf_attr { */ __u64 cookie; } tracing; + struct { + __u32 pf; + __u32 hooknum; + __s32 priority; + __u32 flags; + } netfilter; + struct { + union { + __u32 relative_fd; + __u32 relative_id; + }; + __u64 expected_revision; + } tcx; + struct { + __aligned_u64 path; + __aligned_u64 offsets; + __aligned_u64 ref_ctr_offsets; + __aligned_u64 cookies; + __u32 cnt; + __u32 flags; + __u32 pid; + } uprobe_multi; + struct { + union { + __u32 relative_fd; + __u32 relative_id; + }; + __u64 expected_revision; + } netkit; }; } link_create; struct { /* struct used by BPF_LINK_UPDATE command */ __u32 link_fd; /* link fd */ - /* new program fd to update link with */ - __u32 new_prog_fd; + union { + /* new program fd to update link with */ + __u32 new_prog_fd; + /* new struct_ops map fd to update link with */ + __u32 new_map_fd; + }; __u32 flags; /* extra flags */ - /* expected link's program fd; is specified only if - * BPF_F_REPLACE flag is set in flags */ - __u32 old_prog_fd; + union { + /* expected link's program fd; is specified only if + * BPF_F_REPLACE flag is set in flags. + */ + __u32 old_prog_fd; + /* expected link's map fd; is specified only + * if BPF_F_REPLACE flag is set. + */ + __u32 old_map_fd; + }; } link_update; struct { @@ -1642,17 +1784,17 @@ union bpf_attr { * Description * This helper is a "printk()-like" facility for debugging. It * prints a message defined by format *fmt* (of size *fmt_size*) - * to file *\/sys/kernel/debug/tracing/trace* from DebugFS, if + * to file *\/sys/kernel/tracing/trace* from TraceFS, if * available. It can take up to three additional **u64** * arguments (as an eBPF helpers, the total number of arguments is * limited to five). * * Each time the helper is called, it appends a line to the trace. - * Lines are discarded while *\/sys/kernel/debug/tracing/trace* is - * open, use *\/sys/kernel/debug/tracing/trace_pipe* to avoid this. + * Lines are discarded while *\/sys/kernel/tracing/trace* is + * open, use *\/sys/kernel/tracing/trace_pipe* to avoid this. * The format of the trace is customizable, and the exact output * one will get depends on the options set in - * *\/sys/kernel/debug/tracing/trace_options* (see also the + * *\/sys/kernel/tracing/trace_options* (see also the * *README* file under the same directory). However, it usually * defaults to something like: * @@ -1845,7 +1987,9 @@ union bpf_attr { * performed again, if the helper is used in combination with * direct packet access. * Return - * 0 on success, or a negative error in case of failure. + * 0 on success, or a negative error in case of failure. Positive + * error indicates a potential drop or congestion in the target + * device. The particular positive error codes are not defined. * * u64 bpf_get_current_pid_tgid(void) * Description @@ -2578,8 +2722,8 @@ union bpf_attr { * *bpf_socket* should be one of the following: * * * **struct bpf_sock_ops** for **BPF_PROG_TYPE_SOCK_OPS**. - * * **struct bpf_sock_addr** for **BPF_CGROUP_INET4_CONNECT** - * and **BPF_CGROUP_INET6_CONNECT**. + * * **struct bpf_sock_addr** for **BPF_CGROUP_INET4_CONNECT**, + * **BPF_CGROUP_INET6_CONNECT** and **BPF_CGROUP_UNIX_CONNECT**. * * This helper actually implements a subset of **setsockopt()**. * It supports the following *level*\ s: @@ -2647,6 +2791,11 @@ union bpf_attr { * Use with BPF_F_ADJ_ROOM_ENCAP_L2 flag to further specify the * L2 type as Ethernet. * + * * **BPF_F_ADJ_ROOM_DECAP_L3_IPV4**, + * **BPF_F_ADJ_ROOM_DECAP_L3_IPV6**: + * Indicate the new IP header version after decapsulating the outer + * IP header. Used when the inner and outer IP versions are different. + * * A call to this helper is susceptible to change the underlying * packet buffer. Therefore, at load time, all checks on pointers * previously done by the verifier are invalidated and must be @@ -2791,7 +2940,7 @@ union bpf_attr { * * long bpf_perf_prog_read_value(struct bpf_perf_event_data *ctx, struct bpf_perf_event_value *buf, u32 buf_size) * Description - * For en eBPF program attached to a perf event, retrieve the + * For an eBPF program attached to a perf event, retrieve the * value of the event counter associated to *ctx* and store it in * the structure pointed by *buf* and of size *buf_size*. Enabled * and running times are also stored in the structure (see @@ -2812,8 +2961,8 @@ union bpf_attr { * *bpf_socket* should be one of the following: * * * **struct bpf_sock_ops** for **BPF_PROG_TYPE_SOCK_OPS**. - * * **struct bpf_sock_addr** for **BPF_CGROUP_INET4_CONNECT** - * and **BPF_CGROUP_INET6_CONNECT**. + * * **struct bpf_sock_addr** for **BPF_CGROUP_INET4_CONNECT**, + * **BPF_CGROUP_INET6_CONNECT** and **BPF_CGROUP_UNIX_CONNECT**. * * This helper actually implements a subset of **getsockopt()**. * It supports the same set of *optname*\ s that is supported by @@ -3121,9 +3270,23 @@ union bpf_attr { * **BPF_FIB_LOOKUP_DIRECT** * Do a direct table lookup vs full lookup using FIB * rules. + * **BPF_FIB_LOOKUP_TBID** + * Used with BPF_FIB_LOOKUP_DIRECT. + * Use the routing table ID present in *params*->tbid + * for the fib lookup. * **BPF_FIB_LOOKUP_OUTPUT** * Perform lookup from an egress perspective (default is * ingress). + * **BPF_FIB_LOOKUP_SKIP_NEIGH** + * Skip the neighbour table lookup. *params*->dmac + * and *params*->smac will not be set as output. A common + * use case is to call **bpf_redirect_neigh**\ () after + * doing **bpf_fib_lookup**\ (). + * **BPF_FIB_LOOKUP_SRC** + * Derive and set source IP addr in *params*->ipv{4,6}_src + * for the nexthop. If the src addr cannot be derived, + * **BPF_FIB_LKUP_RET_NO_SRC_ADDR** is returned. In this + * case, *params*->dmac and *params*->smac are not set either. * * *ctx* is either **struct xdp_md** for XDP programs or * **struct sk_buff** tc cls_act programs. @@ -4093,9 +4256,6 @@ union bpf_attr { * **-EOPNOTSUPP** if the operation is not supported, for example * a call from outside of TC ingress. * - * **-ESOCKTNOSUPPORT** if the socket type is not supported - * (reuseport). - * * long bpf_sk_assign(struct bpf_sk_lookup *ctx, struct bpf_sock *sk, u64 flags) * Description * Helper is overloaded depending on BPF program type. This @@ -4360,6 +4520,8 @@ union bpf_attr { * long bpf_get_task_stack(struct task_struct *task, void *buf, u32 size, u64 flags) * Description * Return a user or a kernel stack in bpf program provided buffer. + * Note: the user stack will only be populated if the *task* is + * the current task; all other tasks will return -EOPNOTSUPP. * To achieve this, the helper needs *task*, which is a valid * pointer to **struct task_struct**. To store the stacktrace, the * bpf program provides *buf* with a nonnegative *size*. @@ -4371,6 +4533,7 @@ union bpf_attr { * * **BPF_F_USER_STACK** * Collect a user space stack instead of a kernel stack. + * The *task* must be the current task. * **BPF_F_USER_BUILD_ID** * Collect buildid+offset instead of ips for user stack, * only valid if **BPF_F_USER_STACK** is also specified. @@ -4954,6 +5117,14 @@ union bpf_attr { * different maps if key/value layout matches across maps. * Every bpf_timer_set_callback() can have different callback_fn. * + * *flags* can be one of: + * + * **BPF_F_TIMER_ABS** + * Start the timer in absolute expire value instead of the + * default relative one. + * **BPF_F_TIMER_CPU_PIN** + * Timer will be pinned to the CPU of the caller. + * * Return * 0 on success. * **-EINVAL** if *timer* was not initialized with bpf_timer_init() earlier @@ -4972,9 +5143,14 @@ union bpf_attr { * u64 bpf_get_func_ip(void *ctx) * Description * Get address of the traced function (for tracing and kprobe programs). + * + * When called for kprobe program attached as uprobe it returns + * probe address for both entry and return uprobe. + * * Return - * Address of the traced function. + * Address of the traced function for kprobe. * 0 for kprobes placed within the function (not at the entry). + * Address of the probe for uprobe and return uprobe. * * u64 bpf_get_attach_cookie(void *ctx) * Description @@ -5310,11 +5486,22 @@ union bpf_attr { * Description * Write *len* bytes from *src* into *dst*, starting from *offset* * into *dst*. - * *flags* is currently unused. + * + * *flags* must be 0 except for skb-type dynptrs. + * + * For skb-type dynptrs: + * * All data slices of the dynptr are automatically + * invalidated after **bpf_dynptr_write**\ (). This is + * because writing may pull the skb and change the + * underlying packet buffer. + * + * * For *flags*, please see the flags accepted by + * **bpf_skb_store_bytes**\ (). * Return * 0 on success, -E2BIG if *offset* + *len* exceeds the length * of *dst*'s data, -EINVAL if *dst* is an invalid dynptr or if *dst* - * is a read-only dynptr or if *flags* is not 0. + * is a read-only dynptr or if *flags* is not correct. For skb-type dynptrs, + * other errors correspond to errors returned by **bpf_skb_store_bytes**\ (). * * void *bpf_dynptr_data(const struct bpf_dynptr *ptr, u32 offset, u32 len) * Description @@ -5322,6 +5509,9 @@ union bpf_attr { * * *len* must be a statically known value. The returned data slice * is invalidated whenever the dynptr is invalidated. + * + * skb and xdp type dynptrs may not use bpf_dynptr_data. They should + * instead use bpf_dynptr_slice and bpf_dynptr_slice_rdwr. * Return * Pointer to the underlying dynptr data, NULL if the dynptr is * read-only, if the dynptr is invalid, or if the offset and length @@ -5807,6 +5997,8 @@ enum { BPF_F_ADJ_ROOM_ENCAP_L4_UDP = (1ULL << 4), BPF_F_ADJ_ROOM_NO_CSUM_RESET = (1ULL << 5), BPF_F_ADJ_ROOM_ENCAP_L2_ETH = (1ULL << 6), + BPF_F_ADJ_ROOM_DECAP_L3_IPV4 = (1ULL << 7), + BPF_F_ADJ_ROOM_DECAP_L3_IPV6 = (1ULL << 8), }; enum { @@ -6099,6 +6291,19 @@ struct bpf_sock_tuple { }; }; +/* (Simplified) user return codes for tcx prog type. + * A valid tcx program must return one of these defined values. All other + * return codes are reserved for future use. Must remain compatible with + * their TC_ACT_* counter-parts. For compatibility in behavior, unknown + * return codes are mapped to TCX_NEXT. + */ +enum tcx_action_base { + TCX_NEXT = -1, + TCX_PASS = 0, + TCX_DROP = 2, + TCX_REDIRECT = 7, +}; + struct bpf_xdp_sock { __u32 queue_id; }; @@ -6342,6 +6547,55 @@ struct bpf_link_info { struct { __u32 ifindex; } xdp; + struct { + __u32 map_id; + } struct_ops; + struct { + __u32 pf; + __u32 hooknum; + __s32 priority; + __u32 flags; + } netfilter; + struct { + __aligned_u64 addrs; + __u32 count; /* in/out: kprobe_multi function count */ + __u32 flags; + __u64 missed; + } kprobe_multi; + struct { + __u32 type; /* enum bpf_perf_event_type */ + __u32 :32; + union { + struct { + __aligned_u64 file_name; /* in/out */ + __u32 name_len; + __u32 offset; /* offset from file_name */ + } uprobe; /* BPF_PERF_EVENT_UPROBE, BPF_PERF_EVENT_URETPROBE */ + struct { + __aligned_u64 func_name; /* in/out */ + __u32 name_len; + __u32 offset; /* offset from func_name */ + __u64 addr; + __u64 missed; + } kprobe; /* BPF_PERF_EVENT_KPROBE, BPF_PERF_EVENT_KRETPROBE */ + struct { + __aligned_u64 tp_name; /* in/out */ + __u32 name_len; + } tracepoint; /* BPF_PERF_EVENT_TRACEPOINT */ + struct { + __u64 config; + __u32 type; + } event; /* BPF_PERF_EVENT_EVENT */ + }; + } perf_event; + struct { + __u32 ifindex; + __u32 attach_type; + } tcx; + struct { + __u32 ifindex; + __u32 attach_type; + } netkit; }; } __attribute__((aligned(8))); @@ -6738,6 +6992,9 @@ struct bpf_raw_tracepoint_args { enum { BPF_FIB_LOOKUP_DIRECT = (1U << 0), BPF_FIB_LOOKUP_OUTPUT = (1U << 1), + BPF_FIB_LOOKUP_SKIP_NEIGH = (1U << 2), + BPF_FIB_LOOKUP_TBID = (1U << 3), + BPF_FIB_LOOKUP_SRC = (1U << 4), }; enum { @@ -6750,6 +7007,7 @@ enum { BPF_FIB_LKUP_RET_UNSUPP_LWT, /* fwd requires encapsulation */ BPF_FIB_LKUP_RET_NO_NEIGH, /* no neighbor entry for nh */ BPF_FIB_LKUP_RET_FRAG_NEEDED, /* fragmentation required to fwd */ + BPF_FIB_LKUP_RET_NO_SRC_ADDR, /* failed to derive IP src addr */ }; struct bpf_fib_lookup { @@ -6784,6 +7042,9 @@ struct bpf_fib_lookup { __u32 rt_metric; }; + /* input: source address to consider for lookup + * output: source address result from lookup + */ union { __be32 ipv4_src; __u32 ipv6_src[4]; /* in6_addr; network order */ @@ -6798,9 +7059,19 @@ struct bpf_fib_lookup { __u32 ipv6_dst[4]; /* in6_addr; network order */ }; - /* output */ - __be16 h_vlan_proto; - __be16 h_vlan_TCI; + union { + struct { + /* output */ + __be16 h_vlan_proto; + __be16 h_vlan_TCI; + }; + /* input: when accompanied with the + * 'BPF_FIB_LOOKUP_DIRECT | BPF_FIB_LOOKUP_TBID` flags, a + * specific routing table to use for the fib lookup. + */ + __u32 tbid; + }; + __u8 smac[6]; /* ETH_ALEN */ __u8 dmac[6]; /* ETH_ALEN */ }; @@ -6886,25 +7157,33 @@ struct bpf_spin_lock { }; struct bpf_timer { - __u64 :64; - __u64 :64; + __u64 __opaque[2]; } __attribute__((aligned(8))); struct bpf_dynptr { - __u64 :64; - __u64 :64; + __u64 __opaque[2]; } __attribute__((aligned(8))); struct bpf_list_head { - __u64 :64; - __u64 :64; + __u64 __opaque[2]; } __attribute__((aligned(8))); struct bpf_list_node { - __u64 :64; - __u64 :64; + __u64 __opaque[3]; } __attribute__((aligned(8))); +struct bpf_rb_root { + __u64 __opaque[2]; +} __attribute__((aligned(8))); + +struct bpf_rb_node { + __u64 __opaque[4]; +} __attribute__((aligned(8))); + +struct bpf_refcount { + __u32 __opaque[1]; +} __attribute__((aligned(4))); + struct bpf_sysctl { __u32 write; /* Sysctl is being read (= 0) or written (= 1). * Allows 1,2,4-byte read, but no write. @@ -7054,4 +7333,23 @@ struct bpf_core_relo { enum bpf_core_relo_kind kind; }; +/* + * Flags to control bpf_timer_start() behaviour. + * - BPF_F_TIMER_ABS: Timeout passed is absolute time, by default it is + * relative to current time. + * - BPF_F_TIMER_CPU_PIN: Timer will be pinned to the CPU of the caller. + */ +enum { + BPF_F_TIMER_ABS = (1ULL << 0), + BPF_F_TIMER_CPU_PIN = (1ULL << 1), +}; + +/* BPF numbers iterator state */ +struct bpf_iter_num { + /* opaque iterator state; having __u64 here allows to preserve correct + * alignment requirements in vmlinux.h, generated from BTF + */ + __u64 __opaque[1]; +} __attribute__((aligned(8))); + #endif /* _UAPI__LINUX_BPF_H__ */ diff --git a/include/uapi/linux/fcntl.h b/include/uapi/linux/fcntl.h index 2f86b2a..6c80f96 100644 --- a/include/uapi/linux/fcntl.h +++ b/include/uapi/linux/fcntl.h @@ -43,6 +43,7 @@ #define F_SEAL_GROW 0x0004 /* prevent file from growing */ #define F_SEAL_WRITE 0x0008 /* prevent writes */ #define F_SEAL_FUTURE_WRITE 0x0010 /* prevent future writes while mapped */ +#define F_SEAL_EXEC 0x0020 /* prevent chmod modifying exec bits */ /* (1U << 31) is reserved for signed error codes */ /* @@ -111,4 +112,9 @@ #define AT_RECURSIVE 0x8000 /* Apply to the entire subtree */ +/* Flags for name_to_handle_at(2). We reuse AT_ flag space to save bits... */ +#define AT_HANDLE_FID AT_REMOVEDIR /* file handle is needed to + compare object identity and may not + be usable to open_by_handle_at(2) */ + #endif /* _UAPI_LINUX_FCNTL_H */ diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 901d98b..a0aa05a 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -211,6 +211,9 @@ struct rtnl_link_stats { * @rx_nohandler: Number of packets received on the interface * but dropped by the networking stack because the device is * not designated to receive packets (e.g. backup link in a bond). + * + * @rx_otherhost_dropped: Number of packets dropped due to mismatch + * in destination MAC address. */ struct rtnl_link_stats64 { __u64 rx_packets; @@ -243,6 +246,23 @@ struct rtnl_link_stats64 { __u64 rx_compressed; __u64 tx_compressed; __u64 rx_nohandler; + + __u64 rx_otherhost_dropped; +}; + +/* Subset of link stats useful for in-HW collection. Meaning of the fields is as + * for struct rtnl_link_stats64. + */ +struct rtnl_hw_stats64 { + __u64 rx_packets; + __u64 tx_packets; + __u64 rx_bytes; + __u64 tx_bytes; + __u64 rx_errors; + __u64 tx_errors; + __u64 rx_dropped; + __u64 tx_dropped; + __u64 multicast; }; /* The struct should be in sync with struct ifmap */ @@ -350,7 +370,13 @@ enum { IFLA_GRO_MAX_SIZE, IFLA_TSO_MAX_SIZE, IFLA_TSO_MAX_SEGS, + IFLA_ALLMULTI, /* Allmulti count: > 0 means acts ALLMULTI */ + IFLA_DEVLINK_PORT, + + IFLA_GSO_IPV4_MAX_SIZE, + IFLA_GRO_IPV4_MAX_SIZE, + IFLA_DPLL_PIN, __IFLA_MAX }; @@ -539,6 +565,12 @@ enum { IFLA_BRPORT_MRP_IN_OPEN, IFLA_BRPORT_MCAST_EHT_HOSTS_LIMIT, IFLA_BRPORT_MCAST_EHT_HOSTS_CNT, + IFLA_BRPORT_LOCKED, + IFLA_BRPORT_MAB, + IFLA_BRPORT_MCAST_N_GROUPS, + IFLA_BRPORT_MCAST_MAX_GROUPS, + IFLA_BRPORT_NEIGH_VLAN_SUPPRESS, + IFLA_BRPORT_BACKUP_NHID, __IFLA_BRPORT_MAX }; #define IFLA_BRPORT_MAX (__IFLA_BRPORT_MAX - 1) @@ -605,6 +637,7 @@ enum { IFLA_MACVLAN_MACADDR_COUNT, IFLA_MACVLAN_BC_QUEUE_LEN, IFLA_MACVLAN_BC_QUEUE_LEN_USED, + IFLA_MACVLAN_BC_CUTOFF, __IFLA_MACVLAN_MAX, }; @@ -715,7 +748,79 @@ enum ipvlan_mode { #define IPVLAN_F_PRIVATE 0x01 #define IPVLAN_F_VEPA 0x02 +/* Tunnel RTM header */ +struct tunnel_msg { + __u8 family; + __u8 flags; + __u16 reserved2; + __u32 ifindex; +}; + +/* netkit section */ +enum netkit_action { + NETKIT_NEXT = -1, + NETKIT_PASS = 0, + NETKIT_DROP = 2, + NETKIT_REDIRECT = 7, +}; + +enum netkit_mode { + NETKIT_L2, + NETKIT_L3, +}; + +enum { + IFLA_NETKIT_UNSPEC, + IFLA_NETKIT_PEER_INFO, + IFLA_NETKIT_PRIMARY, + IFLA_NETKIT_POLICY, + IFLA_NETKIT_PEER_POLICY, + IFLA_NETKIT_MODE, + __IFLA_NETKIT_MAX, +}; +#define IFLA_NETKIT_MAX (__IFLA_NETKIT_MAX - 1) + /* VXLAN section */ + +/* include statistics in the dump */ +#define TUNNEL_MSG_FLAG_STATS 0x01 + +#define TUNNEL_MSG_VALID_USER_FLAGS TUNNEL_MSG_FLAG_STATS + +/* Embedded inside VXLAN_VNIFILTER_ENTRY_STATS */ +enum { + VNIFILTER_ENTRY_STATS_UNSPEC, + VNIFILTER_ENTRY_STATS_RX_BYTES, + VNIFILTER_ENTRY_STATS_RX_PKTS, + VNIFILTER_ENTRY_STATS_RX_DROPS, + VNIFILTER_ENTRY_STATS_RX_ERRORS, + VNIFILTER_ENTRY_STATS_TX_BYTES, + VNIFILTER_ENTRY_STATS_TX_PKTS, + VNIFILTER_ENTRY_STATS_TX_DROPS, + VNIFILTER_ENTRY_STATS_TX_ERRORS, + VNIFILTER_ENTRY_STATS_PAD, + __VNIFILTER_ENTRY_STATS_MAX +}; +#define VNIFILTER_ENTRY_STATS_MAX (__VNIFILTER_ENTRY_STATS_MAX - 1) + +enum { + VXLAN_VNIFILTER_ENTRY_UNSPEC, + VXLAN_VNIFILTER_ENTRY_START, + VXLAN_VNIFILTER_ENTRY_END, + VXLAN_VNIFILTER_ENTRY_GROUP, + VXLAN_VNIFILTER_ENTRY_GROUP6, + VXLAN_VNIFILTER_ENTRY_STATS, + __VXLAN_VNIFILTER_ENTRY_MAX +}; +#define VXLAN_VNIFILTER_ENTRY_MAX (__VXLAN_VNIFILTER_ENTRY_MAX - 1) + +enum { + VXLAN_VNIFILTER_UNSPEC, + VXLAN_VNIFILTER_ENTRY, + __VXLAN_VNIFILTER_MAX +}; +#define VXLAN_VNIFILTER_MAX (__VXLAN_VNIFILTER_MAX - 1) + enum { IFLA_VXLAN_UNSPEC, IFLA_VXLAN_ID, @@ -747,6 +852,8 @@ enum { IFLA_VXLAN_GPE, IFLA_VXLAN_TTL_INHERIT, IFLA_VXLAN_DF, + IFLA_VXLAN_VNIFILTER, /* only applicable with COLLECT_METADATA mode */ + IFLA_VXLAN_LOCALBYPASS, __IFLA_VXLAN_MAX }; #define IFLA_VXLAN_MAX (__IFLA_VXLAN_MAX - 1) @@ -780,6 +887,7 @@ enum { IFLA_GENEVE_LABEL, IFLA_GENEVE_TTL_INHERIT, IFLA_GENEVE_DF, + IFLA_GENEVE_INNER_PROTO_INHERIT, __IFLA_GENEVE_MAX }; #define IFLA_GENEVE_MAX (__IFLA_GENEVE_MAX - 1) @@ -825,6 +933,8 @@ enum { IFLA_GTP_FD1, IFLA_GTP_PDP_HASHSIZE, IFLA_GTP_ROLE, + IFLA_GTP_CREATE_SOCKETS, + IFLA_GTP_RESTART_COUNT, __IFLA_GTP_MAX, }; #define IFLA_GTP_MAX (__IFLA_GTP_MAX - 1) @@ -1161,6 +1271,17 @@ enum { #define IFLA_STATS_FILTER_BIT(ATTR) (1 << (ATTR - 1)) +enum { + IFLA_STATS_GETSET_UNSPEC, + IFLA_STATS_GET_FILTERS, /* Nest of IFLA_STATS_LINK_xxx, each a u32 with + * a filter mask for the corresponding group. + */ + IFLA_STATS_SET_OFFLOAD_XSTATS_L3_STATS, /* 0 or 1 as u8 */ + __IFLA_STATS_GETSET_MAX, +}; + +#define IFLA_STATS_GETSET_MAX (__IFLA_STATS_GETSET_MAX - 1) + /* These are embedded into IFLA_STATS_LINK_XSTATS: * [IFLA_STATS_LINK_XSTATS] * -> [LINK_XSTATS_TYPE_xxx] @@ -1178,10 +1299,21 @@ enum { enum { IFLA_OFFLOAD_XSTATS_UNSPEC, IFLA_OFFLOAD_XSTATS_CPU_HIT, /* struct rtnl_link_stats64 */ + IFLA_OFFLOAD_XSTATS_HW_S_INFO, /* HW stats info. A nest */ + IFLA_OFFLOAD_XSTATS_L3_STATS, /* struct rtnl_hw_stats64 */ __IFLA_OFFLOAD_XSTATS_MAX }; #define IFLA_OFFLOAD_XSTATS_MAX (__IFLA_OFFLOAD_XSTATS_MAX - 1) +enum { + IFLA_OFFLOAD_XSTATS_HW_S_INFO_UNSPEC, + IFLA_OFFLOAD_XSTATS_HW_S_INFO_REQUEST, /* u8 */ + IFLA_OFFLOAD_XSTATS_HW_S_INFO_USED, /* u8 */ + __IFLA_OFFLOAD_XSTATS_HW_S_INFO_MAX, +}; +#define IFLA_OFFLOAD_XSTATS_HW_S_INFO_MAX \ + (__IFLA_OFFLOAD_XSTATS_HW_S_INFO_MAX - 1) + /* XDP section */ #define XDP_FLAGS_UPDATE_IF_NOEXIST (1U << 0) @@ -1280,4 +1412,14 @@ enum { #define IFLA_MCTP_MAX (__IFLA_MCTP_MAX - 1) +/* DSA section */ + +enum { + IFLA_DSA_UNSPEC, + IFLA_DSA_MASTER, + __IFLA_DSA_MAX, +}; + +#define IFLA_DSA_MAX (__IFLA_DSA_MAX - 1) + #endif /* _UAPI_LINUX_IF_LINK_H */ diff --git a/include/uapi/linux/if_xdp.h b/include/uapi/linux/if_xdp.h index a78a809..73a47da 100644 --- a/include/uapi/linux/if_xdp.h +++ b/include/uapi/linux/if_xdp.h @@ -25,6 +25,12 @@ * application. */ #define XDP_USE_NEED_WAKEUP (1 << 3) +/* By setting this option, userspace application indicates that it can + * handle multiple descriptors per packet thus enabling xsk core to split + * multi-buffer XDP frames into multiple Rx descriptors. Without this set + * such frames will be dropped by xsk. + */ +#define XDP_USE_SG (1 << 4) /* Flags for xsk_umem_config flags */ #define XDP_UMEM_UNALIGNED_CHUNK_FLAG (1 << 0) @@ -106,6 +112,9 @@ struct xdp_desc { __u32 options; }; +/* Flag indicating packet constitutes of multiple buffers*/ +#define XDP_PKT_CONTD (1 << 0) + /* UMEM descriptor is __u64 */ #endif /* _LINUX_IF_XDP_H */ diff --git a/include/uapi/linux/netdev.h b/include/uapi/linux/netdev.h new file mode 100644 index 0000000..2943a15 --- /dev/null +++ b/include/uapi/linux/netdev.h @@ -0,0 +1,79 @@ +/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) */ +/* Do not edit directly, auto-generated from: */ +/* Documentation/netlink/specs/netdev.yaml */ +/* YNL-GEN uapi header */ + +#ifndef _UAPI_LINUX_NETDEV_H +#define _UAPI_LINUX_NETDEV_H + +#define NETDEV_FAMILY_NAME "netdev" +#define NETDEV_FAMILY_VERSION 1 + +/** + * enum netdev_xdp_act + * @NETDEV_XDP_ACT_BASIC: XDP features set supported by all drivers + * (XDP_ABORTED, XDP_DROP, XDP_PASS, XDP_TX) + * @NETDEV_XDP_ACT_REDIRECT: The netdev supports XDP_REDIRECT + * @NETDEV_XDP_ACT_NDO_XMIT: This feature informs if netdev implements + * ndo_xdp_xmit callback. + * @NETDEV_XDP_ACT_XSK_ZEROCOPY: This feature informs if netdev supports AF_XDP + * in zero copy mode. + * @NETDEV_XDP_ACT_HW_OFFLOAD: This feature informs if netdev supports XDP hw + * offloading. + * @NETDEV_XDP_ACT_RX_SG: This feature informs if netdev implements non-linear + * XDP buffer support in the driver napi callback. + * @NETDEV_XDP_ACT_NDO_XMIT_SG: This feature informs if netdev implements + * non-linear XDP buffer support in ndo_xdp_xmit callback. + */ +enum netdev_xdp_act { + NETDEV_XDP_ACT_BASIC = 1, + NETDEV_XDP_ACT_REDIRECT = 2, + NETDEV_XDP_ACT_NDO_XMIT = 4, + NETDEV_XDP_ACT_XSK_ZEROCOPY = 8, + NETDEV_XDP_ACT_HW_OFFLOAD = 16, + NETDEV_XDP_ACT_RX_SG = 32, + NETDEV_XDP_ACT_NDO_XMIT_SG = 64, + + /* private: */ + NETDEV_XDP_ACT_MASK = 127, +}; + +/** + * enum netdev_xdp_rx_metadata + * @NETDEV_XDP_RX_METADATA_TIMESTAMP: Device is capable of exposing receive HW + * timestamp via bpf_xdp_metadata_rx_timestamp(). + * @NETDEV_XDP_RX_METADATA_HASH: Device is capable of exposing receive packet + * hash via bpf_xdp_metadata_rx_hash(). + */ +enum netdev_xdp_rx_metadata { + NETDEV_XDP_RX_METADATA_TIMESTAMP = 1, + NETDEV_XDP_RX_METADATA_HASH = 2, + + /* private: */ + NETDEV_XDP_RX_METADATA_MASK = 3, +}; + +enum { + NETDEV_A_DEV_IFINDEX = 1, + NETDEV_A_DEV_PAD, + NETDEV_A_DEV_XDP_FEATURES, + NETDEV_A_DEV_XDP_ZC_MAX_SEGS, + NETDEV_A_DEV_XDP_RX_METADATA_FEATURES, + + __NETDEV_A_DEV_MAX, + NETDEV_A_DEV_MAX = (__NETDEV_A_DEV_MAX - 1) +}; + +enum { + NETDEV_CMD_DEV_GET = 1, + NETDEV_CMD_DEV_ADD_NTF, + NETDEV_CMD_DEV_DEL_NTF, + NETDEV_CMD_DEV_CHANGE_NTF, + + __NETDEV_CMD_MAX, + NETDEV_CMD_MAX = (__NETDEV_CMD_MAX - 1) +}; + +#define NETDEV_MCGRP_MGMT "mgmt" + +#endif /* _UAPI_LINUX_NETDEV_H */ diff --git a/include/uapi/linux/openat2.h b/include/uapi/linux/openat2.h new file mode 100644 index 0000000..a5feb76 --- /dev/null +++ b/include/uapi/linux/openat2.h @@ -0,0 +1,43 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef _UAPI_LINUX_OPENAT2_H +#define _UAPI_LINUX_OPENAT2_H + +#include + +/* + * Arguments for how openat2(2) should open the target path. If only @flags and + * @mode are non-zero, then openat2(2) operates very similarly to openat(2). + * + * However, unlike openat(2), unknown or invalid bits in @flags result in + * -EINVAL rather than being silently ignored. @mode must be zero unless one of + * {O_CREAT, O_TMPFILE} are set. + * + * @flags: O_* flags. + * @mode: O_CREAT/O_TMPFILE file mode. + * @resolve: RESOLVE_* flags. + */ +struct open_how { + __u64 flags; + __u64 mode; + __u64 resolve; +}; + +/* how->resolve flags for openat2(2). */ +#define RESOLVE_NO_XDEV 0x01 /* Block mount-point crossings + (includes bind-mounts). */ +#define RESOLVE_NO_MAGICLINKS 0x02 /* Block traversal through procfs-style + "magic-links". */ +#define RESOLVE_NO_SYMLINKS 0x04 /* Block traversal through all symlinks + (implies OEXT_NO_MAGICLINKS) */ +#define RESOLVE_BENEATH 0x08 /* Block "lexical" trickery like + "..", symlinks, and absolute + paths which escape the dirfd. */ +#define RESOLVE_IN_ROOT 0x10 /* Make all jumps to "/" and ".." + be scoped inside the dirfd + (similar to chroot(2)). */ +#define RESOLVE_CACHED 0x20 /* Only complete if resolution can be + completed through cached lookup. May + return -EAGAIN if that's not + possible. */ + +#endif /* _UAPI_LINUX_OPENAT2_H */ diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index ccb7f5d..39c6a25 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -374,6 +374,7 @@ enum perf_event_read_format { #define PERF_ATTR_SIZE_VER5 112 /* add: aux_watermark */ #define PERF_ATTR_SIZE_VER6 120 /* add: aux_sample_size */ #define PERF_ATTR_SIZE_VER7 128 /* add: sig_data */ +#define PERF_ATTR_SIZE_VER8 136 /* add: config3 */ /* * Hardware event_id to monitor via a performance monitoring event: @@ -515,6 +516,8 @@ struct perf_event_attr { * truncated accordingly on 32 bit architectures. */ __u64 sig_data; + + __u64 config3; /* extension of config2 */ }; /* @@ -1336,7 +1339,8 @@ union perf_mem_data_src { #define PERF_MEM_LVLNUM_L2 0x02 /* L2 */ #define PERF_MEM_LVLNUM_L3 0x03 /* L3 */ #define PERF_MEM_LVLNUM_L4 0x04 /* L4 */ -/* 5-0x8 available */ +/* 5-0x7 available */ +#define PERF_MEM_LVLNUM_UNC 0x08 /* Uncached */ #define PERF_MEM_LVLNUM_CXL 0x09 /* CXL */ #define PERF_MEM_LVLNUM_IO 0x0a /* I/O */ #define PERF_MEM_LVLNUM_ANY_CACHE 0x0b /* Any cache */ diff --git a/scripts/build-fuzzers.sh b/scripts/build-fuzzers.sh index a704f47..72de432 100755 --- a/scripts/build-fuzzers.sh +++ b/scripts/build-fuzzers.sh @@ -41,14 +41,14 @@ fi # due to https://bugs.gentoo.org/794601) so let's just point the script to # commits referring to versions of libelf that actually can be built rm -rf elfutils -git clone git://sourceware.org/git/elfutils.git +git clone https://sourceware.org/git/elfutils.git ( cd elfutils -git checkout e9f3045caa5c4498f371383e5519151942d48b6d +git checkout 67a187d4c1790058fc7fd218317851cb68bb087c git log --oneline -1 # ASan isn't compatible with -Wl,--no-undefined: https://github.com/google/sanitizers/issues/380 -find -name Makefile.am | xargs sed -i 's/,--no-undefined//' +sed -i 's/^\(NO_UNDEFINED=\).*/\1/' configure.ac # ASan isn't compatible with -Wl,-z,defs either: # https://clang.llvm.org/docs/AddressSanitizer.html#usage @@ -62,6 +62,7 @@ fi autoreconf -i -f if ! ./configure --enable-maintainer-mode --disable-debuginfod --disable-libdebuginfod \ + --disable-demangler --without-bzlib --without-lzma --without-zstd \ CC="$CC" CFLAGS="-Wno-error $CFLAGS" CXX="$CXX" CXXFLAGS="-Wno-error $CXXFLAGS" LDFLAGS="$CFLAGS"; then cat config.log exit 1 diff --git a/scripts/sync-kernel.sh b/scripts/sync-kernel.sh index eb09ef1..55ffcd1 100755 --- a/scripts/sync-kernel.sh +++ b/scripts/sync-kernel.sh @@ -43,8 +43,10 @@ PATH_MAP=( \ [tools/include/uapi/linux/bpf.h]=include/uapi/linux/bpf.h \ [tools/include/uapi/linux/btf.h]=include/uapi/linux/btf.h \ [tools/include/uapi/linux/fcntl.h]=include/uapi/linux/fcntl.h \ + [tools/include/uapi/linux/openat2.h]=include/uapi/linux/openat2.h \ [tools/include/uapi/linux/if_link.h]=include/uapi/linux/if_link.h \ [tools/include/uapi/linux/if_xdp.h]=include/uapi/linux/if_xdp.h \ + [tools/include/uapi/linux/netdev.h]=include/uapi/linux/netdev.h \ [tools/include/uapi/linux/netlink.h]=include/uapi/linux/netlink.h \ [tools/include/uapi/linux/pkt_cls.h]=include/uapi/linux/pkt_cls.h \ [tools/include/uapi/linux/pkt_sched.h]=include/uapi/linux/pkt_sched.h \ @@ -260,7 +262,7 @@ if ((${COMMIT_CNT} <= 0)); then fi # Exclude baseline commit and generate nice cover letter with summary -git format-patch ${SQUASH_BASE_TAG}..${SQUASH_TIP_TAG} --cover-letter -o ${TMP_DIR}/patches +git format-patch --no-signature ${SQUASH_BASE_TAG}..${SQUASH_TIP_TAG} --cover-letter -o ${TMP_DIR}/patches # Now is time to re-apply libbpf-related linux patches to libbpf repo cd_to ${LIBBPF_REPO} diff --git a/src/Makefile b/src/Makefile index d535f81..35c5724 100644 --- a/src/Makefile +++ b/src/Makefile @@ -9,7 +9,7 @@ else endif LIBBPF_MAJOR_VERSION := 1 -LIBBPF_MINOR_VERSION := 1 +LIBBPF_MINOR_VERSION := 3 LIBBPF_PATCH_VERSION := 0 LIBBPF_VERSION := $(LIBBPF_MAJOR_VERSION).$(LIBBPF_MINOR_VERSION).$(LIBBPF_PATCH_VERSION) LIBBPF_MAJMIN_VERSION := $(LIBBPF_MAJOR_VERSION).$(LIBBPF_MINOR_VERSION).0 @@ -35,7 +35,10 @@ ALL_CFLAGS := $(INCLUDES) SHARED_CFLAGS += -fPIC -fvisibility=hidden -DSHARED CFLAGS ?= -g -O2 -Werror -Wall -std=gnu89 -ALL_CFLAGS += $(CFLAGS) -D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64 $(EXTRA_CFLAGS) +ALL_CFLAGS += $(CFLAGS) \ + -D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64 \ + -Wno-unknown-warning-option -Wno-format-overflow \ + $(EXTRA_CFLAGS) ALL_LDFLAGS += $(LDFLAGS) $(EXTRA_LDFLAGS) ifdef NO_PKG_CONFIG @@ -52,7 +55,7 @@ STATIC_OBJDIR := $(OBJDIR)/staticobjs OBJS := bpf.o btf.o libbpf.o libbpf_errno.o netlink.o \ nlattr.o str_error.o libbpf_probes.o bpf_prog_linfo.o \ btf_dump.o hashmap.o ringbuf.o strset.o linker.o gen_loader.o \ - relo_core.o usdt.o + relo_core.o usdt.o zip.o elf.o SHARED_OBJS := $(addprefix $(SHARED_OBJDIR)/,$(OBJS)) STATIC_OBJS := $(addprefix $(STATIC_OBJDIR)/,$(OBJS)) diff --git a/src/bpf.c b/src/bpf.c index 9aff98f..9dc9625 100644 --- a/src/bpf.c +++ b/src/bpf.c @@ -230,9 +230,9 @@ alloc_zero_tailing_info(const void *orecord, __u32 cnt, int bpf_prog_load(enum bpf_prog_type prog_type, const char *prog_name, const char *license, const struct bpf_insn *insns, size_t insn_cnt, - const struct bpf_prog_load_opts *opts) + struct bpf_prog_load_opts *opts) { - const size_t attr_sz = offsetofend(union bpf_attr, fd_array); + const size_t attr_sz = offsetofend(union bpf_attr, log_true_size); void *finfo = NULL, *linfo = NULL; const char *func_info, *line_info; __u32 log_size, log_level, attach_prog_fd, attach_btf_obj_fd; @@ -290,10 +290,6 @@ int bpf_prog_load(enum bpf_prog_type prog_type, if (!!log_buf != !!log_size) return libbpf_err(-EINVAL); - if (log_level > (4 | 2 | 1)) - return libbpf_err(-EINVAL); - if (log_level && !log_buf) - return libbpf_err(-EINVAL); func_info_rec_size = OPTS_GET(opts, func_info_rec_size, 0); func_info = OPTS_GET(opts, func_info, NULL); @@ -316,6 +312,7 @@ int bpf_prog_load(enum bpf_prog_type prog_type, } fd = sys_bpf_prog_load(&attr, attr_sz, attempts); + OPTS_SET(opts, log_true_size, attr.log_true_size); if (fd >= 0) return fd; @@ -356,6 +353,7 @@ int bpf_prog_load(enum bpf_prog_type prog_type, } fd = sys_bpf_prog_load(&attr, attr_sz, attempts); + OPTS_SET(opts, log_true_size, attr.log_true_size); if (fd >= 0) goto done; } @@ -370,6 +368,7 @@ int bpf_prog_load(enum bpf_prog_type prog_type, attr.log_level = 1; fd = sys_bpf_prog_load(&attr, attr_sz, attempts); + OPTS_SET(opts, log_true_size, attr.log_true_size); } done: /* free() doesn't affect errno, so we don't need to restore it */ @@ -573,20 +572,30 @@ int bpf_map_update_batch(int fd, const void *keys, const void *values, __u32 *co (void *)keys, (void *)values, count, opts); } -int bpf_obj_pin(int fd, const char *pathname) +int bpf_obj_pin_opts(int fd, const char *pathname, const struct bpf_obj_pin_opts *opts) { - const size_t attr_sz = offsetofend(union bpf_attr, file_flags); + const size_t attr_sz = offsetofend(union bpf_attr, path_fd); union bpf_attr attr; int ret; + if (!OPTS_VALID(opts, bpf_obj_pin_opts)) + return libbpf_err(-EINVAL); + memset(&attr, 0, attr_sz); + attr.path_fd = OPTS_GET(opts, path_fd, 0); attr.pathname = ptr_to_u64((void *)pathname); + attr.file_flags = OPTS_GET(opts, file_flags, 0); attr.bpf_fd = fd; ret = sys_bpf(BPF_OBJ_PIN, &attr, attr_sz); return libbpf_err_errno(ret); } +int bpf_obj_pin(int fd, const char *pathname) +{ + return bpf_obj_pin_opts(fd, pathname, NULL); +} + int bpf_obj_get(const char *pathname) { return bpf_obj_get_opts(pathname, NULL); @@ -594,7 +603,7 @@ int bpf_obj_get(const char *pathname) int bpf_obj_get_opts(const char *pathname, const struct bpf_obj_get_opts *opts) { - const size_t attr_sz = offsetofend(union bpf_attr, file_flags); + const size_t attr_sz = offsetofend(union bpf_attr, path_fd); union bpf_attr attr; int fd; @@ -602,6 +611,7 @@ int bpf_obj_get_opts(const char *pathname, const struct bpf_obj_get_opts *opts) return libbpf_err(-EINVAL); memset(&attr, 0, attr_sz); + attr.path_fd = OPTS_GET(opts, path_fd, 0); attr.pathname = ptr_to_u64((void *)pathname); attr.file_flags = OPTS_GET(opts, file_flags, 0); @@ -619,55 +629,89 @@ int bpf_prog_attach(int prog_fd, int target_fd, enum bpf_attach_type type, return bpf_prog_attach_opts(prog_fd, target_fd, type, &opts); } -int bpf_prog_attach_opts(int prog_fd, int target_fd, - enum bpf_attach_type type, - const struct bpf_prog_attach_opts *opts) +int bpf_prog_attach_opts(int prog_fd, int target, enum bpf_attach_type type, + const struct bpf_prog_attach_opts *opts) { - const size_t attr_sz = offsetofend(union bpf_attr, replace_bpf_fd); + const size_t attr_sz = offsetofend(union bpf_attr, expected_revision); + __u32 relative_id, flags; + int ret, relative_fd; union bpf_attr attr; - int ret; if (!OPTS_VALID(opts, bpf_prog_attach_opts)) return libbpf_err(-EINVAL); + relative_id = OPTS_GET(opts, relative_id, 0); + relative_fd = OPTS_GET(opts, relative_fd, 0); + flags = OPTS_GET(opts, flags, 0); + + /* validate we don't have unexpected combinations of non-zero fields */ + if (relative_fd && relative_id) + return libbpf_err(-EINVAL); + memset(&attr, 0, attr_sz); - attr.target_fd = target_fd; - attr.attach_bpf_fd = prog_fd; - attr.attach_type = type; - attr.attach_flags = OPTS_GET(opts, flags, 0); - attr.replace_bpf_fd = OPTS_GET(opts, replace_prog_fd, 0); + attr.target_fd = target; + attr.attach_bpf_fd = prog_fd; + attr.attach_type = type; + attr.replace_bpf_fd = OPTS_GET(opts, replace_fd, 0); + attr.expected_revision = OPTS_GET(opts, expected_revision, 0); + + if (relative_id) { + attr.attach_flags = flags | BPF_F_ID; + attr.relative_id = relative_id; + } else { + attr.attach_flags = flags; + attr.relative_fd = relative_fd; + } ret = sys_bpf(BPF_PROG_ATTACH, &attr, attr_sz); return libbpf_err_errno(ret); } -int bpf_prog_detach(int target_fd, enum bpf_attach_type type) +int bpf_prog_detach_opts(int prog_fd, int target, enum bpf_attach_type type, + const struct bpf_prog_detach_opts *opts) { - const size_t attr_sz = offsetofend(union bpf_attr, replace_bpf_fd); + const size_t attr_sz = offsetofend(union bpf_attr, expected_revision); + __u32 relative_id, flags; + int ret, relative_fd; union bpf_attr attr; - int ret; + + if (!OPTS_VALID(opts, bpf_prog_detach_opts)) + return libbpf_err(-EINVAL); + + relative_id = OPTS_GET(opts, relative_id, 0); + relative_fd = OPTS_GET(opts, relative_fd, 0); + flags = OPTS_GET(opts, flags, 0); + + /* validate we don't have unexpected combinations of non-zero fields */ + if (relative_fd && relative_id) + return libbpf_err(-EINVAL); memset(&attr, 0, attr_sz); - attr.target_fd = target_fd; - attr.attach_type = type; + attr.target_fd = target; + attr.attach_bpf_fd = prog_fd; + attr.attach_type = type; + attr.expected_revision = OPTS_GET(opts, expected_revision, 0); + + if (relative_id) { + attr.attach_flags = flags | BPF_F_ID; + attr.relative_id = relative_id; + } else { + attr.attach_flags = flags; + attr.relative_fd = relative_fd; + } ret = sys_bpf(BPF_PROG_DETACH, &attr, attr_sz); return libbpf_err_errno(ret); } +int bpf_prog_detach(int target_fd, enum bpf_attach_type type) +{ + return bpf_prog_detach_opts(0, target_fd, type, NULL); +} + int bpf_prog_detach2(int prog_fd, int target_fd, enum bpf_attach_type type) { - const size_t attr_sz = offsetofend(union bpf_attr, replace_bpf_fd); - union bpf_attr attr; - int ret; - - memset(&attr, 0, attr_sz); - attr.target_fd = target_fd; - attr.attach_bpf_fd = prog_fd; - attr.attach_type = type; - - ret = sys_bpf(BPF_PROG_DETACH, &attr, attr_sz); - return libbpf_err_errno(ret); + return bpf_prog_detach_opts(prog_fd, target_fd, type, NULL); } int bpf_link_create(int prog_fd, int target_fd, @@ -675,9 +719,9 @@ int bpf_link_create(int prog_fd, int target_fd, const struct bpf_link_create_opts *opts) { const size_t attr_sz = offsetofend(union bpf_attr, link_create); - __u32 target_btf_id, iter_info_len; + __u32 target_btf_id, iter_info_len, relative_id; + int fd, err, relative_fd; union bpf_attr attr; - int fd, err; if (!OPTS_VALID(opts, bpf_link_create_opts)) return libbpf_err(-EINVAL); @@ -723,6 +767,17 @@ int bpf_link_create(int prog_fd, int target_fd, if (!OPTS_ZEROED(opts, kprobe_multi)) return libbpf_err(-EINVAL); break; + case BPF_TRACE_UPROBE_MULTI: + attr.link_create.uprobe_multi.flags = OPTS_GET(opts, uprobe_multi.flags, 0); + attr.link_create.uprobe_multi.cnt = OPTS_GET(opts, uprobe_multi.cnt, 0); + attr.link_create.uprobe_multi.path = ptr_to_u64(OPTS_GET(opts, uprobe_multi.path, 0)); + attr.link_create.uprobe_multi.offsets = ptr_to_u64(OPTS_GET(opts, uprobe_multi.offsets, 0)); + attr.link_create.uprobe_multi.ref_ctr_offsets = ptr_to_u64(OPTS_GET(opts, uprobe_multi.ref_ctr_offsets, 0)); + attr.link_create.uprobe_multi.cookies = ptr_to_u64(OPTS_GET(opts, uprobe_multi.cookies, 0)); + attr.link_create.uprobe_multi.pid = OPTS_GET(opts, uprobe_multi.pid, 0); + if (!OPTS_ZEROED(opts, uprobe_multi)) + return libbpf_err(-EINVAL); + break; case BPF_TRACE_FENTRY: case BPF_TRACE_FEXIT: case BPF_MODIFY_RETURN: @@ -731,6 +786,46 @@ int bpf_link_create(int prog_fd, int target_fd, if (!OPTS_ZEROED(opts, tracing)) return libbpf_err(-EINVAL); break; + case BPF_NETFILTER: + attr.link_create.netfilter.pf = OPTS_GET(opts, netfilter.pf, 0); + attr.link_create.netfilter.hooknum = OPTS_GET(opts, netfilter.hooknum, 0); + attr.link_create.netfilter.priority = OPTS_GET(opts, netfilter.priority, 0); + attr.link_create.netfilter.flags = OPTS_GET(opts, netfilter.flags, 0); + if (!OPTS_ZEROED(opts, netfilter)) + return libbpf_err(-EINVAL); + break; + case BPF_TCX_INGRESS: + case BPF_TCX_EGRESS: + relative_fd = OPTS_GET(opts, tcx.relative_fd, 0); + relative_id = OPTS_GET(opts, tcx.relative_id, 0); + if (relative_fd && relative_id) + return libbpf_err(-EINVAL); + if (relative_id) { + attr.link_create.tcx.relative_id = relative_id; + attr.link_create.flags |= BPF_F_ID; + } else { + attr.link_create.tcx.relative_fd = relative_fd; + } + attr.link_create.tcx.expected_revision = OPTS_GET(opts, tcx.expected_revision, 0); + if (!OPTS_ZEROED(opts, tcx)) + return libbpf_err(-EINVAL); + break; + case BPF_NETKIT_PRIMARY: + case BPF_NETKIT_PEER: + relative_fd = OPTS_GET(opts, netkit.relative_fd, 0); + relative_id = OPTS_GET(opts, netkit.relative_id, 0); + if (relative_fd && relative_id) + return libbpf_err(-EINVAL); + if (relative_id) { + attr.link_create.netkit.relative_id = relative_id; + attr.link_create.flags |= BPF_F_ID; + } else { + attr.link_create.netkit.relative_fd = relative_fd; + } + attr.link_create.netkit.expected_revision = OPTS_GET(opts, netkit.expected_revision, 0); + if (!OPTS_ZEROED(opts, netkit)) + return libbpf_err(-EINVAL); + break; default: if (!OPTS_ZEROED(opts, flags)) return libbpf_err(-EINVAL); @@ -794,11 +889,17 @@ int bpf_link_update(int link_fd, int new_prog_fd, if (!OPTS_VALID(opts, bpf_link_update_opts)) return libbpf_err(-EINVAL); + if (OPTS_GET(opts, old_prog_fd, 0) && OPTS_GET(opts, old_map_fd, 0)) + return libbpf_err(-EINVAL); + memset(&attr, 0, attr_sz); attr.link_update.link_fd = link_fd; attr.link_update.new_prog_fd = new_prog_fd; attr.link_update.flags = OPTS_GET(opts, flags, 0); - attr.link_update.old_prog_fd = OPTS_GET(opts, old_prog_fd, 0); + if (OPTS_GET(opts, old_prog_fd, 0)) + attr.link_update.old_prog_fd = OPTS_GET(opts, old_prog_fd, 0); + else if (OPTS_GET(opts, old_map_fd, 0)) + attr.link_update.old_map_fd = OPTS_GET(opts, old_map_fd, 0); ret = sys_bpf(BPF_LINK_UPDATE, &attr, attr_sz); return libbpf_err_errno(ret); @@ -817,8 +918,7 @@ int bpf_iter_create(int link_fd) return libbpf_err_errno(fd); } -int bpf_prog_query_opts(int target_fd, - enum bpf_attach_type type, +int bpf_prog_query_opts(int target, enum bpf_attach_type type, struct bpf_prog_query_opts *opts) { const size_t attr_sz = offsetofend(union bpf_attr, query); @@ -829,18 +929,20 @@ int bpf_prog_query_opts(int target_fd, return libbpf_err(-EINVAL); memset(&attr, 0, attr_sz); - - attr.query.target_fd = target_fd; - attr.query.attach_type = type; - attr.query.query_flags = OPTS_GET(opts, query_flags, 0); - attr.query.prog_cnt = OPTS_GET(opts, prog_cnt, 0); - attr.query.prog_ids = ptr_to_u64(OPTS_GET(opts, prog_ids, NULL)); - attr.query.prog_attach_flags = ptr_to_u64(OPTS_GET(opts, prog_attach_flags, NULL)); + attr.query.target_fd = target; + attr.query.attach_type = type; + attr.query.query_flags = OPTS_GET(opts, query_flags, 0); + attr.query.count = OPTS_GET(opts, count, 0); + attr.query.prog_ids = ptr_to_u64(OPTS_GET(opts, prog_ids, NULL)); + attr.query.link_ids = ptr_to_u64(OPTS_GET(opts, link_ids, NULL)); + attr.query.prog_attach_flags = ptr_to_u64(OPTS_GET(opts, prog_attach_flags, NULL)); + attr.query.link_attach_flags = ptr_to_u64(OPTS_GET(opts, link_attach_flags, NULL)); ret = sys_bpf(BPF_PROG_QUERY, &attr, attr_sz); OPTS_SET(opts, attach_flags, attr.query.attach_flags); - OPTS_SET(opts, prog_cnt, attr.query.prog_cnt); + OPTS_SET(opts, revision, attr.query.revision); + OPTS_SET(opts, count, attr.query.count); return libbpf_err_errno(ret); } @@ -1044,6 +1146,26 @@ int bpf_obj_get_info_by_fd(int bpf_fd, void *info, __u32 *info_len) return libbpf_err_errno(err); } +int bpf_prog_get_info_by_fd(int prog_fd, struct bpf_prog_info *info, __u32 *info_len) +{ + return bpf_obj_get_info_by_fd(prog_fd, info, info_len); +} + +int bpf_map_get_info_by_fd(int map_fd, struct bpf_map_info *info, __u32 *info_len) +{ + return bpf_obj_get_info_by_fd(map_fd, info, info_len); +} + +int bpf_btf_get_info_by_fd(int btf_fd, struct bpf_btf_info *info, __u32 *info_len) +{ + return bpf_obj_get_info_by_fd(btf_fd, info, info_len); +} + +int bpf_link_get_info_by_fd(int link_fd, struct bpf_link_info *info, __u32 *info_len) +{ + return bpf_obj_get_info_by_fd(link_fd, info, info_len); +} + int bpf_raw_tracepoint_open(const char *name, int prog_fd) { const size_t attr_sz = offsetofend(union bpf_attr, raw_tracepoint); @@ -1058,9 +1180,9 @@ int bpf_raw_tracepoint_open(const char *name, int prog_fd) return libbpf_err_errno(fd); } -int bpf_btf_load(const void *btf_data, size_t btf_size, const struct bpf_btf_load_opts *opts) +int bpf_btf_load(const void *btf_data, size_t btf_size, struct bpf_btf_load_opts *opts) { - const size_t attr_sz = offsetofend(union bpf_attr, btf_log_level); + const size_t attr_sz = offsetofend(union bpf_attr, btf_log_true_size); union bpf_attr attr; char *log_buf; size_t log_size; @@ -1103,6 +1225,8 @@ int bpf_btf_load(const void *btf_data, size_t btf_size, const struct bpf_btf_loa attr.btf_log_level = 1; fd = sys_bpf_fd(BPF_BTF_LOAD, &attr, attr_sz); } + + OPTS_SET(opts, log_true_size, attr.btf_log_true_size); return libbpf_err_errno(fd); } diff --git a/src/bpf.h b/src/bpf.h index 7468978..d0f5377 100644 --- a/src/bpf.h +++ b/src/bpf.h @@ -1,7 +1,7 @@ /* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ /* - * common eBPF ELF operations. + * Common BPF ELF operations. * * Copyright (C) 2013-2015 Alexei Starovoitov * Copyright (C) 2015 Wang Nan @@ -96,13 +96,20 @@ struct bpf_prog_load_opts { __u32 log_level; __u32 log_size; char *log_buf; + /* output: actual total log contents size (including termintaing zero). + * It could be both larger than original log_size (if log was + * truncated), or smaller (if log buffer wasn't filled completely). + * If kernel doesn't support this feature, log_size is left unchanged. + */ + __u32 log_true_size; + size_t :0; }; -#define bpf_prog_load_opts__last_field log_buf +#define bpf_prog_load_opts__last_field log_true_size LIBBPF_API int bpf_prog_load(enum bpf_prog_type prog_type, const char *prog_name, const char *license, const struct bpf_insn *insns, size_t insn_cnt, - const struct bpf_prog_load_opts *opts); + struct bpf_prog_load_opts *opts); /* Flags to direct loading requirements */ #define MAPS_RELAX_COMPAT 0x01 @@ -117,11 +124,18 @@ struct bpf_btf_load_opts { char *log_buf; __u32 log_level; __u32 log_size; + /* output: actual total log contents size (including termintaing zero). + * It could be both larger than original log_size (if log was + * truncated), or smaller (if log buffer wasn't filled completely). + * If kernel doesn't support this feature, log_size is left unchanged. + */ + __u32 log_true_size; + size_t :0; }; -#define bpf_btf_load_opts__last_field log_size +#define bpf_btf_load_opts__last_field log_true_size LIBBPF_API int bpf_btf_load(const void *btf_data, size_t btf_size, - const struct bpf_btf_load_opts *opts); + struct bpf_btf_load_opts *opts); LIBBPF_API int bpf_map_update_elem(int fd, const void *key, const void *value, __u64 flags); @@ -270,36 +284,96 @@ LIBBPF_API int bpf_map_update_batch(int fd, const void *keys, const void *values __u32 *count, const struct bpf_map_batch_opts *opts); +struct bpf_obj_pin_opts { + size_t sz; /* size of this struct for forward/backward compatibility */ + + __u32 file_flags; + int path_fd; + + size_t :0; +}; +#define bpf_obj_pin_opts__last_field path_fd + +LIBBPF_API int bpf_obj_pin(int fd, const char *pathname); +LIBBPF_API int bpf_obj_pin_opts(int fd, const char *pathname, + const struct bpf_obj_pin_opts *opts); + struct bpf_obj_get_opts { size_t sz; /* size of this struct for forward/backward compatibility */ __u32 file_flags; + int path_fd; size_t :0; }; -#define bpf_obj_get_opts__last_field file_flags +#define bpf_obj_get_opts__last_field path_fd -LIBBPF_API int bpf_obj_pin(int fd, const char *pathname); LIBBPF_API int bpf_obj_get(const char *pathname); LIBBPF_API int bpf_obj_get_opts(const char *pathname, const struct bpf_obj_get_opts *opts); -struct bpf_prog_attach_opts { - size_t sz; /* size of this struct for forward/backward compatibility */ - unsigned int flags; - int replace_prog_fd; -}; -#define bpf_prog_attach_opts__last_field replace_prog_fd - LIBBPF_API int bpf_prog_attach(int prog_fd, int attachable_fd, enum bpf_attach_type type, unsigned int flags); -LIBBPF_API int bpf_prog_attach_opts(int prog_fd, int attachable_fd, - enum bpf_attach_type type, - const struct bpf_prog_attach_opts *opts); LIBBPF_API int bpf_prog_detach(int attachable_fd, enum bpf_attach_type type); LIBBPF_API int bpf_prog_detach2(int prog_fd, int attachable_fd, enum bpf_attach_type type); +struct bpf_prog_attach_opts { + size_t sz; /* size of this struct for forward/backward compatibility */ + __u32 flags; + union { + int replace_prog_fd; + int replace_fd; + }; + int relative_fd; + __u32 relative_id; + __u64 expected_revision; + size_t :0; +}; +#define bpf_prog_attach_opts__last_field expected_revision + +struct bpf_prog_detach_opts { + size_t sz; /* size of this struct for forward/backward compatibility */ + __u32 flags; + int relative_fd; + __u32 relative_id; + __u64 expected_revision; + size_t :0; +}; +#define bpf_prog_detach_opts__last_field expected_revision + +/** + * @brief **bpf_prog_attach_opts()** attaches the BPF program corresponding to + * *prog_fd* to a *target* which can represent a file descriptor or netdevice + * ifindex. + * + * @param prog_fd BPF program file descriptor + * @param target attach location file descriptor or ifindex + * @param type attach type for the BPF program + * @param opts options for configuring the attachment + * @return 0, on success; negative error code, otherwise (errno is also set to + * the error code) + */ +LIBBPF_API int bpf_prog_attach_opts(int prog_fd, int target, + enum bpf_attach_type type, + const struct bpf_prog_attach_opts *opts); + +/** + * @brief **bpf_prog_detach_opts()** detaches the BPF program corresponding to + * *prog_fd* from a *target* which can represent a file descriptor or netdevice + * ifindex. + * + * @param prog_fd BPF program file descriptor + * @param target detach location file descriptor or ifindex + * @param type detach type for the BPF program + * @param opts options for configuring the detachment + * @return 0, on success; negative error code, otherwise (errno is also set to + * the error code) + */ +LIBBPF_API int bpf_prog_detach_opts(int prog_fd, int target, + enum bpf_attach_type type, + const struct bpf_prog_detach_opts *opts); + union bpf_iter_link_info; /* defined in up-to-date linux/bpf.h */ struct bpf_link_create_opts { size_t sz; /* size of this struct for forward/backward compatibility */ @@ -318,13 +392,38 @@ struct bpf_link_create_opts { const unsigned long *addrs; const __u64 *cookies; } kprobe_multi; + struct { + __u32 flags; + __u32 cnt; + const char *path; + const unsigned long *offsets; + const unsigned long *ref_ctr_offsets; + const __u64 *cookies; + __u32 pid; + } uprobe_multi; struct { __u64 cookie; } tracing; + struct { + __u32 pf; + __u32 hooknum; + __s32 priority; + __u32 flags; + } netfilter; + struct { + __u32 relative_fd; + __u32 relative_id; + __u64 expected_revision; + } tcx; + struct { + __u32 relative_fd; + __u32 relative_id; + __u64 expected_revision; + } netkit; }; size_t :0; }; -#define bpf_link_create_opts__last_field kprobe_multi.cookies +#define bpf_link_create_opts__last_field uprobe_multi.pid LIBBPF_API int bpf_link_create(int prog_fd, int target_fd, enum bpf_attach_type attach_type, @@ -336,8 +435,9 @@ struct bpf_link_update_opts { size_t sz; /* size of this struct for forward/backward compatibility */ __u32 flags; /* extra flags */ __u32 old_prog_fd; /* expected old program FD */ + __u32 old_map_fd; /* expected old map FD */ }; -#define bpf_link_update_opts__last_field old_prog_fd +#define bpf_link_update_opts__last_field old_map_fd LIBBPF_API int bpf_link_update(int link_fd, int new_prog_fd, const struct bpf_link_update_opts *opts); @@ -387,18 +487,104 @@ LIBBPF_API int bpf_link_get_fd_by_id_opts(__u32 id, const struct bpf_get_fd_by_id_opts *opts); LIBBPF_API int bpf_obj_get_info_by_fd(int bpf_fd, void *info, __u32 *info_len); +/** + * @brief **bpf_prog_get_info_by_fd()** obtains information about the BPF + * program corresponding to *prog_fd*. + * + * Populates up to *info_len* bytes of *info* and updates *info_len* with the + * actual number of bytes written to *info*. + * + * @param prog_fd BPF program file descriptor + * @param info pointer to **struct bpf_prog_info** that will be populated with + * BPF program information + * @param info_len pointer to the size of *info*; on success updated with the + * number of bytes written to *info* + * @return 0, on success; negative error code, otherwise (errno is also set to + * the error code) + */ +LIBBPF_API int bpf_prog_get_info_by_fd(int prog_fd, struct bpf_prog_info *info, __u32 *info_len); + +/** + * @brief **bpf_map_get_info_by_fd()** obtains information about the BPF + * map corresponding to *map_fd*. + * + * Populates up to *info_len* bytes of *info* and updates *info_len* with the + * actual number of bytes written to *info*. + * + * @param map_fd BPF map file descriptor + * @param info pointer to **struct bpf_map_info** that will be populated with + * BPF map information + * @param info_len pointer to the size of *info*; on success updated with the + * number of bytes written to *info* + * @return 0, on success; negative error code, otherwise (errno is also set to + * the error code) + */ +LIBBPF_API int bpf_map_get_info_by_fd(int map_fd, struct bpf_map_info *info, __u32 *info_len); + +/** + * @brief **bpf_btf_get_info_by_fd()** obtains information about the + * BTF object corresponding to *btf_fd*. + * + * Populates up to *info_len* bytes of *info* and updates *info_len* with the + * actual number of bytes written to *info*. + * + * @param btf_fd BTF object file descriptor + * @param info pointer to **struct bpf_btf_info** that will be populated with + * BTF object information + * @param info_len pointer to the size of *info*; on success updated with the + * number of bytes written to *info* + * @return 0, on success; negative error code, otherwise (errno is also set to + * the error code) + */ +LIBBPF_API int bpf_btf_get_info_by_fd(int btf_fd, struct bpf_btf_info *info, __u32 *info_len); + +/** + * @brief **bpf_btf_get_info_by_fd()** obtains information about the BPF + * link corresponding to *link_fd*. + * + * Populates up to *info_len* bytes of *info* and updates *info_len* with the + * actual number of bytes written to *info*. + * + * @param link_fd BPF link file descriptor + * @param info pointer to **struct bpf_link_info** that will be populated with + * BPF link information + * @param info_len pointer to the size of *info*; on success updated with the + * number of bytes written to *info* + * @return 0, on success; negative error code, otherwise (errno is also set to + * the error code) + */ +LIBBPF_API int bpf_link_get_info_by_fd(int link_fd, struct bpf_link_info *info, __u32 *info_len); + struct bpf_prog_query_opts { size_t sz; /* size of this struct for forward/backward compatibility */ __u32 query_flags; __u32 attach_flags; /* output argument */ __u32 *prog_ids; - __u32 prog_cnt; /* input+output argument */ + union { + /* input+output argument */ + __u32 prog_cnt; + __u32 count; + }; __u32 *prog_attach_flags; + __u32 *link_ids; + __u32 *link_attach_flags; + __u64 revision; + size_t :0; }; -#define bpf_prog_query_opts__last_field prog_attach_flags +#define bpf_prog_query_opts__last_field revision -LIBBPF_API int bpf_prog_query_opts(int target_fd, - enum bpf_attach_type type, +/** + * @brief **bpf_prog_query_opts()** queries the BPF programs and BPF links + * which are attached to *target* which can represent a file descriptor or + * netdevice ifindex. + * + * @param target query location file descriptor or ifindex + * @param type attach type for the BPF program + * @param opts options for configuring the query + * @return 0, on success; negative error code, otherwise (errno is also set to + * the error code) + */ +LIBBPF_API int bpf_prog_query_opts(int target, enum bpf_attach_type type, struct bpf_prog_query_opts *opts); LIBBPF_API int bpf_prog_query(int target_fd, enum bpf_attach_type type, __u32 query_flags, __u32 *attach_flags, diff --git a/src/bpf_core_read.h b/src/bpf_core_read.h index 496e6a8..1ac57bb 100644 --- a/src/bpf_core_read.h +++ b/src/bpf_core_read.h @@ -364,7 +364,7 @@ enum bpf_enum_value_kind { /* Non-CO-RE variant of BPF_CORE_READ_INTO() */ #define BPF_PROBE_READ_INTO(dst, src, a, ...) ({ \ - ___core_read(bpf_probe_read, bpf_probe_read, \ + ___core_read(bpf_probe_read_kernel, bpf_probe_read_kernel, \ dst, (src), a, ##__VA_ARGS__) \ }) @@ -400,7 +400,7 @@ enum bpf_enum_value_kind { /* Non-CO-RE variant of BPF_CORE_READ_STR_INTO() */ #define BPF_PROBE_READ_STR_INTO(dst, src, a, ...) ({ \ - ___core_read(bpf_probe_read_str, bpf_probe_read, \ + ___core_read(bpf_probe_read_kernel_str, bpf_probe_read_kernel, \ dst, (src), a, ##__VA_ARGS__) \ }) diff --git a/src/bpf_gen_internal.h b/src/bpf_gen_internal.h index 2233089..fdf4440 100644 --- a/src/bpf_gen_internal.h +++ b/src/bpf_gen_internal.h @@ -11,6 +11,7 @@ struct ksym_relo_desc { int insn_idx; bool is_weak; bool is_typeless; + bool is_ld64; }; struct ksym_desc { @@ -24,6 +25,7 @@ struct ksym_desc { bool typeless; }; int insn; + bool is_ld64; }; struct bpf_gen { @@ -65,7 +67,7 @@ void bpf_gen__map_update_elem(struct bpf_gen *gen, int map_idx, void *value, __u void bpf_gen__map_freeze(struct bpf_gen *gen, int map_idx); void bpf_gen__record_attach_target(struct bpf_gen *gen, const char *name, enum bpf_attach_type type); void bpf_gen__record_extern(struct bpf_gen *gen, const char *name, bool is_weak, - bool is_typeless, int kind, int insn_idx); + bool is_typeless, bool is_ld64, int kind, int insn_idx); void bpf_gen__record_relo_core(struct bpf_gen *gen, const struct bpf_core_relo *core_relo); void bpf_gen__populate_outer_map(struct bpf_gen *gen, int outer_map_idx, int key, int inner_map_idx); diff --git a/src/bpf_helper_defs.h b/src/bpf_helper_defs.h index 9bcaca1..90eec74 100644 --- a/src/bpf_helper_defs.h +++ b/src/bpf_helper_defs.h @@ -118,17 +118,17 @@ static __u64 (*bpf_ktime_get_ns)(void) = (void *) 5; * * This helper is a "printk()-like" facility for debugging. It * prints a message defined by format *fmt* (of size *fmt_size*) - * to file *\/sys/kernel/debug/tracing/trace* from DebugFS, if + * to file *\/sys/kernel/tracing/trace* from TraceFS, if * available. It can take up to three additional **u64** * arguments (as an eBPF helpers, the total number of arguments is * limited to five). * * Each time the helper is called, it appends a line to the trace. - * Lines are discarded while *\/sys/kernel/debug/tracing/trace* is - * open, use *\/sys/kernel/debug/tracing/trace_pipe* to avoid this. + * Lines are discarded while *\/sys/kernel/tracing/trace* is + * open, use *\/sys/kernel/tracing/trace_pipe* to avoid this. * The format of the trace is customizable, and the exact output * one will get depends on the options set in - * *\/sys/kernel/debug/tracing/trace_options* (see also the + * *\/sys/kernel/tracing/trace_options* (see also the * *README* file under the same directory). However, it usually * defaults to something like: * @@ -350,7 +350,9 @@ static long (*bpf_tail_call)(void *ctx, void *prog_array_map, __u32 index) = (vo * direct packet access. * * Returns - * 0 on success, or a negative error in case of failure. + * 0 on success, or a negative error in case of failure. Positive + * error indicates a potential drop or congestion in the target + * device. The particular positive error codes are not defined. */ static long (*bpf_clone_redirect)(struct __sk_buff *skb, __u32 ifindex, __u64 flags) = (void *) 13; @@ -1204,8 +1206,8 @@ static long (*bpf_set_hash)(struct __sk_buff *skb, __u32 hash) = (void *) 48; * *bpf_socket* should be one of the following: * * * **struct bpf_sock_ops** for **BPF_PROG_TYPE_SOCK_OPS**. - * * **struct bpf_sock_addr** for **BPF_CGROUP_INET4_CONNECT** - * and **BPF_CGROUP_INET6_CONNECT**. + * * **struct bpf_sock_addr** for **BPF_CGROUP_INET4_CONNECT**, + * **BPF_CGROUP_INET6_CONNECT** and **BPF_CGROUP_UNIX_CONNECT**. * * This helper actually implements a subset of **setsockopt()**. * It supports the following *level*\ s: @@ -1277,6 +1279,11 @@ static long (*bpf_setsockopt)(void *bpf_socket, int level, int optname, void *op * Use with BPF_F_ADJ_ROOM_ENCAP_L2 flag to further specify the * L2 type as Ethernet. * + * * **BPF_F_ADJ_ROOM_DECAP_L3_IPV4**, + * **BPF_F_ADJ_ROOM_DECAP_L3_IPV6**: + * Indicate the new IP header version after decapsulating the outer + * IP header. Used when the inner and outer IP versions are different. + * * A call to this helper is susceptible to change the underlying * packet buffer. Therefore, at load time, all checks on pointers * previously done by the verifier are invalidated and must be @@ -1445,7 +1452,7 @@ static long (*bpf_perf_event_read_value)(void *map, __u64 flags, struct bpf_perf /* * bpf_perf_prog_read_value * - * For en eBPF program attached to a perf event, retrieve the + * For an eBPF program attached to a perf event, retrieve the * value of the event counter associated to *ctx* and store it in * the structure pointed by *buf* and of size *buf_size*. Enabled * and running times are also stored in the structure (see @@ -1470,8 +1477,8 @@ static long (*bpf_perf_prog_read_value)(struct bpf_perf_event_data *ctx, struct * *bpf_socket* should be one of the following: * * * **struct bpf_sock_ops** for **BPF_PROG_TYPE_SOCK_OPS**. - * * **struct bpf_sock_addr** for **BPF_CGROUP_INET4_CONNECT** - * and **BPF_CGROUP_INET6_CONNECT**. + * * **struct bpf_sock_addr** for **BPF_CGROUP_INET4_CONNECT**, + * **BPF_CGROUP_INET6_CONNECT** and **BPF_CGROUP_UNIX_CONNECT**. * * This helper actually implements a subset of **getsockopt()**. * It supports the same set of *optname*\ s that is supported by @@ -1827,9 +1834,23 @@ static long (*bpf_skb_load_bytes_relative)(const void *skb, __u32 offset, void * * **BPF_FIB_LOOKUP_DIRECT** * Do a direct table lookup vs full lookup using FIB * rules. + * **BPF_FIB_LOOKUP_TBID** + * Used with BPF_FIB_LOOKUP_DIRECT. + * Use the routing table ID present in *params*->tbid + * for the fib lookup. * **BPF_FIB_LOOKUP_OUTPUT** * Perform lookup from an egress perspective (default is * ingress). + * **BPF_FIB_LOOKUP_SKIP_NEIGH** + * Skip the neighbour table lookup. *params*->dmac + * and *params*->smac will not be set as output. A common + * use case is to call **bpf_redirect_neigh**\ () after + * doing **bpf_fib_lookup**\ (). + * **BPF_FIB_LOOKUP_SRC** + * Derive and set source IP addr in *params*->ipv{4,6}_src + * for the nexthop. If the src addr cannot be derived, + * **BPF_FIB_LKUP_RET_NO_SRC_ADDR** is returned. In this + * case, *params*->dmac and *params*->smac are not set either. * * *ctx* is either **struct xdp_md** for XDP programs or * **struct sk_buff** tc cls_act programs. @@ -3019,9 +3040,6 @@ static __u64 (*bpf_get_current_ancestor_cgroup_id)(int ancestor_level) = (void * * * **-EOPNOTSUPP** if the operation is not supported, for example * a call from outside of TC ingress. - * - * **-ESOCKTNOSUPPORT** if the socket type is not supported - * (reuseport). */ static long (*bpf_sk_assign)(void *ctx, void *sk, __u64 flags) = (void *) 124; @@ -3304,6 +3322,8 @@ static struct udp6_sock *(*bpf_skc_to_udp6_sock)(void *sk) = (void *) 140; * bpf_get_task_stack * * Return a user or a kernel stack in bpf program provided buffer. + * Note: the user stack will only be populated if the *task* is + * the current task; all other tasks will return -EOPNOTSUPP. * To achieve this, the helper needs *task*, which is a valid * pointer to **struct task_struct**. To store the stacktrace, the * bpf program provides *buf* with a nonnegative *size*. @@ -3315,6 +3335,7 @@ static struct udp6_sock *(*bpf_skc_to_udp6_sock)(void *sk) = (void *) 140; * * **BPF_F_USER_STACK** * Collect a user space stack instead of a kernel stack. + * The *task* must be the current task. * **BPF_F_USER_BUILD_ID** * Collect buildid+offset instead of ips for user stack, * only valid if **BPF_F_USER_STACK** is also specified. @@ -4018,6 +4039,14 @@ static long (*bpf_timer_set_callback)(struct bpf_timer *timer, void *callback_fn * different maps if key/value layout matches across maps. * Every bpf_timer_set_callback() can have different callback_fn. * + * *flags* can be one of: + * + * **BPF_F_TIMER_ABS** + * Start the timer in absolute expire value instead of the + * default relative one. + * **BPF_F_TIMER_CPU_PIN** + * Timer will be pinned to the CPU of the caller. + * * * Returns * 0 on success. @@ -4045,9 +4074,14 @@ static long (*bpf_timer_cancel)(struct bpf_timer *timer) = (void *) 172; * * Get address of the traced function (for tracing and kprobe programs). * + * When called for kprobe program attached as uprobe it returns + * probe address for both entry and return uprobe. + * + * * Returns - * Address of the traced function. + * Address of the traced function for kprobe. * 0 for kprobes placed within the function (not at the entry). + * Address of the probe for uprobe and return uprobe. */ static __u64 (*bpf_get_func_ip)(void *ctx) = (void *) 173; @@ -4498,12 +4532,23 @@ static long (*bpf_dynptr_read)(void *dst, __u32 len, const struct bpf_dynptr *sr * * Write *len* bytes from *src* into *dst*, starting from *offset* * into *dst*. - * *flags* is currently unused. + * + * *flags* must be 0 except for skb-type dynptrs. + * + * For skb-type dynptrs: + * * All data slices of the dynptr are automatically + * invalidated after **bpf_dynptr_write**\ (). This is + * because writing may pull the skb and change the + * underlying packet buffer. + * + * * For *flags*, please see the flags accepted by + * **bpf_skb_store_bytes**\ (). * * Returns * 0 on success, -E2BIG if *offset* + *len* exceeds the length * of *dst*'s data, -EINVAL if *dst* is an invalid dynptr or if *dst* - * is a read-only dynptr or if *flags* is not 0. + * is a read-only dynptr or if *flags* is not correct. For skb-type dynptrs, + * other errors correspond to errors returned by **bpf_skb_store_bytes**\ (). */ static long (*bpf_dynptr_write)(const struct bpf_dynptr *dst, __u32 offset, void *src, __u32 len, __u64 flags) = (void *) 202; @@ -4515,6 +4560,9 @@ static long (*bpf_dynptr_write)(const struct bpf_dynptr *dst, __u32 offset, void * *len* must be a statically known value. The returned data slice * is invalidated whenever the dynptr is invalidated. * + * skb and xdp type dynptrs may not use bpf_dynptr_data. They should + * instead use bpf_dynptr_slice and bpf_dynptr_slice_rdwr. + * * Returns * Pointer to the underlying dynptr data, NULL if the dynptr is * read-only, if the dynptr is invalid, or if the offset and length diff --git a/src/bpf_helpers.h b/src/bpf_helpers.h index d37c4fe..77ceea5 100644 --- a/src/bpf_helpers.h +++ b/src/bpf_helpers.h @@ -77,16 +77,21 @@ /* * Helper macros to manipulate data structures */ -#ifndef offsetof -#define offsetof(TYPE, MEMBER) ((unsigned long)&((TYPE *)0)->MEMBER) -#endif -#ifndef container_of + +/* offsetof() definition that uses __builtin_offset() might not preserve field + * offset CO-RE relocation properly, so force-redefine offsetof() using + * old-school approach which works with CO-RE correctly + */ +#undef offsetof +#define offsetof(type, member) ((unsigned long)&((type *)0)->member) + +/* redefined container_of() to ensure we use the above offsetof() macro */ +#undef container_of #define container_of(ptr, type, member) \ ({ \ void *__mptr = (void *)(ptr); \ ((type *)(__mptr - offsetof(type, member))); \ }) -#endif /* * Compiler (optimization) barrier. @@ -109,7 +114,7 @@ * This is a variable-specific variant of more global barrier(). */ #ifndef barrier_var -#define barrier_var(var) asm volatile("" : "=r"(var) : "0"(var)) +#define barrier_var(var) asm volatile("" : "+r"(var)) #endif /* @@ -174,8 +179,14 @@ enum libbpf_tristate { #define __kconfig __attribute__((section(".kconfig"))) #define __ksym __attribute__((section(".ksyms"))) +#define __kptr_untrusted __attribute__((btf_type_tag("kptr_untrusted"))) #define __kptr __attribute__((btf_type_tag("kptr"))) -#define __kptr_ref __attribute__((btf_type_tag("kptr_ref"))) +#define __percpu_kptr __attribute__((btf_type_tag("percpu_kptr"))) + +#define bpf_ksym_exists(sym) ({ \ + _Static_assert(!__builtin_constant_p(!!sym), #sym " should be marked as __weak"); \ + !!sym; \ +}) #ifndef ___bpf_concat #define ___bpf_concat(a, b) a ## b @@ -286,4 +297,107 @@ enum libbpf_tristate { /* Helper macro to print out debug messages */ #define bpf_printk(fmt, args...) ___bpf_pick_printk(args)(fmt, ##args) +struct bpf_iter_num; + +extern int bpf_iter_num_new(struct bpf_iter_num *it, int start, int end) __weak __ksym; +extern int *bpf_iter_num_next(struct bpf_iter_num *it) __weak __ksym; +extern void bpf_iter_num_destroy(struct bpf_iter_num *it) __weak __ksym; + +#ifndef bpf_for_each +/* bpf_for_each(iter_type, cur_elem, args...) provides generic construct for + * using BPF open-coded iterators without having to write mundane explicit + * low-level loop logic. Instead, it provides for()-like generic construct + * that can be used pretty naturally. E.g., for some hypothetical cgroup + * iterator, you'd write: + * + * struct cgroup *cg, *parent_cg = <...>; + * + * bpf_for_each(cgroup, cg, parent_cg, CG_ITER_CHILDREN) { + * bpf_printk("Child cgroup id = %d", cg->cgroup_id); + * if (cg->cgroup_id == 123) + * break; + * } + * + * I.e., it looks almost like high-level for each loop in other languages, + * supports continue/break, and is verifiable by BPF verifier. + * + * For iterating integers, the difference betwen bpf_for_each(num, i, N, M) + * and bpf_for(i, N, M) is in that bpf_for() provides additional proof to + * verifier that i is in [N, M) range, and in bpf_for_each() case i is `int + * *`, not just `int`. So for integers bpf_for() is more convenient. + * + * Note: this macro relies on C99 feature of allowing to declare variables + * inside for() loop, bound to for() loop lifetime. It also utilizes GCC + * extension: __attribute__((cleanup())), supported by both GCC and + * Clang. + */ +#define bpf_for_each(type, cur, args...) for ( \ + /* initialize and define destructor */ \ + struct bpf_iter_##type ___it __attribute__((aligned(8), /* enforce, just in case */, \ + cleanup(bpf_iter_##type##_destroy))), \ + /* ___p pointer is just to call bpf_iter_##type##_new() *once* to init ___it */ \ + *___p __attribute__((unused)) = ( \ + bpf_iter_##type##_new(&___it, ##args), \ + /* this is a workaround for Clang bug: it currently doesn't emit BTF */ \ + /* for bpf_iter_##type##_destroy() when used from cleanup() attribute */ \ + (void)bpf_iter_##type##_destroy, (void *)0); \ + /* iteration and termination check */ \ + (((cur) = bpf_iter_##type##_next(&___it))); \ +) +#endif /* bpf_for_each */ + +#ifndef bpf_for +/* bpf_for(i, start, end) implements a for()-like looping construct that sets + * provided integer variable *i* to values starting from *start* through, + * but not including, *end*. It also proves to BPF verifier that *i* belongs + * to range [start, end), so this can be used for accessing arrays without + * extra checks. + * + * Note: *start* and *end* are assumed to be expressions with no side effects + * and whose values do not change throughout bpf_for() loop execution. They do + * not have to be statically known or constant, though. + * + * Note: similarly to bpf_for_each(), it relies on C99 feature of declaring for() + * loop bound variables and cleanup attribute, supported by GCC and Clang. + */ +#define bpf_for(i, start, end) for ( \ + /* initialize and define destructor */ \ + struct bpf_iter_num ___it __attribute__((aligned(8), /* enforce, just in case */ \ + cleanup(bpf_iter_num_destroy))), \ + /* ___p pointer is necessary to call bpf_iter_num_new() *once* to init ___it */ \ + *___p __attribute__((unused)) = ( \ + bpf_iter_num_new(&___it, (start), (end)), \ + /* this is a workaround for Clang bug: it currently doesn't emit BTF */ \ + /* for bpf_iter_num_destroy() when used from cleanup() attribute */ \ + (void)bpf_iter_num_destroy, (void *)0); \ + ({ \ + /* iteration step */ \ + int *___t = bpf_iter_num_next(&___it); \ + /* termination and bounds check */ \ + (___t && ((i) = *___t, (i) >= (start) && (i) < (end))); \ + }); \ +) +#endif /* bpf_for */ + +#ifndef bpf_repeat +/* bpf_repeat(N) performs N iterations without exposing iteration number + * + * Note: similarly to bpf_for_each(), it relies on C99 feature of declaring for() + * loop bound variables and cleanup attribute, supported by GCC and Clang. + */ +#define bpf_repeat(N) for ( \ + /* initialize and define destructor */ \ + struct bpf_iter_num ___it __attribute__((aligned(8), /* enforce, just in case */ \ + cleanup(bpf_iter_num_destroy))), \ + /* ___p pointer is necessary to call bpf_iter_num_new() *once* to init ___it */ \ + *___p __attribute__((unused)) = ( \ + bpf_iter_num_new(&___it, 0, (N)), \ + /* this is a workaround for Clang bug: it currently doesn't emit BTF */ \ + /* for bpf_iter_num_destroy() when used from cleanup() attribute */ \ + (void)bpf_iter_num_destroy, (void *)0); \ + bpf_iter_num_next(&___it); \ + /* nothing here */ \ +) +#endif /* bpf_repeat */ + #endif diff --git a/src/bpf_tracing.h b/src/bpf_tracing.h index 3ea2b47..1c13f8e 100644 --- a/src/bpf_tracing.h +++ b/src/bpf_tracing.h @@ -32,6 +32,9 @@ #elif defined(__TARGET_ARCH_arc) #define bpf_target_arc #define bpf_target_defined +#elif defined(__TARGET_ARCH_loongarch) + #define bpf_target_loongarch + #define bpf_target_defined #else /* Fall back to what the compiler says */ @@ -62,6 +65,9 @@ #elif defined(__arc__) #define bpf_target_arc #define bpf_target_defined +#elif defined(__loongarch__) + #define bpf_target_loongarch + #define bpf_target_defined #endif /* no compiler target */ #endif @@ -72,6 +78,10 @@ #if defined(bpf_target_x86) +/* + * https://en.wikipedia.org/wiki/X86_calling_conventions#System_V_AMD64_ABI + */ + #if defined(__KERNEL__) || defined(__VMLINUX_H__) #define __PT_PARM1_REG di @@ -79,25 +89,40 @@ #define __PT_PARM3_REG dx #define __PT_PARM4_REG cx #define __PT_PARM5_REG r8 +#define __PT_PARM6_REG r9 +/* + * Syscall uses r10 for PARM4. See arch/x86/entry/entry_64.S:entry_SYSCALL_64 + * comments in Linux sources. And refer to syscall(2) manpage. + */ +#define __PT_PARM1_SYSCALL_REG __PT_PARM1_REG +#define __PT_PARM2_SYSCALL_REG __PT_PARM2_REG +#define __PT_PARM3_SYSCALL_REG __PT_PARM3_REG +#define __PT_PARM4_SYSCALL_REG r10 +#define __PT_PARM5_SYSCALL_REG __PT_PARM5_REG +#define __PT_PARM6_SYSCALL_REG __PT_PARM6_REG + #define __PT_RET_REG sp #define __PT_FP_REG bp #define __PT_RC_REG ax #define __PT_SP_REG sp #define __PT_IP_REG ip -/* syscall uses r10 for PARM4 */ -#define PT_REGS_PARM4_SYSCALL(x) ((x)->r10) -#define PT_REGS_PARM4_CORE_SYSCALL(x) BPF_CORE_READ(x, r10) #else #ifdef __i386__ +/* i386 kernel is built with -mregparm=3 */ #define __PT_PARM1_REG eax #define __PT_PARM2_REG edx #define __PT_PARM3_REG ecx -/* i386 kernel is built with -mregparm=3 */ -#define __PT_PARM4_REG __unsupported__ -#define __PT_PARM5_REG __unsupported__ +/* i386 syscall ABI is very different, refer to syscall(2) manpage */ +#define __PT_PARM1_SYSCALL_REG ebx +#define __PT_PARM2_SYSCALL_REG ecx +#define __PT_PARM3_SYSCALL_REG edx +#define __PT_PARM4_SYSCALL_REG esi +#define __PT_PARM5_SYSCALL_REG edi +#define __PT_PARM6_SYSCALL_REG ebp + #define __PT_RET_REG esp #define __PT_FP_REG ebp #define __PT_RC_REG eax @@ -111,14 +136,20 @@ #define __PT_PARM3_REG rdx #define __PT_PARM4_REG rcx #define __PT_PARM5_REG r8 +#define __PT_PARM6_REG r9 + +#define __PT_PARM1_SYSCALL_REG __PT_PARM1_REG +#define __PT_PARM2_SYSCALL_REG __PT_PARM2_REG +#define __PT_PARM3_SYSCALL_REG __PT_PARM3_REG +#define __PT_PARM4_SYSCALL_REG r10 +#define __PT_PARM5_SYSCALL_REG __PT_PARM5_REG +#define __PT_PARM6_SYSCALL_REG __PT_PARM6_REG + #define __PT_RET_REG rsp #define __PT_FP_REG rbp #define __PT_RC_REG rax #define __PT_SP_REG rsp #define __PT_IP_REG rip -/* syscall uses r10 for PARM4 */ -#define PT_REGS_PARM4_SYSCALL(x) ((x)->r10) -#define PT_REGS_PARM4_CORE_SYSCALL(x) BPF_CORE_READ(x, r10) #endif /* __i386__ */ @@ -126,6 +157,10 @@ #elif defined(bpf_target_s390) +/* + * https://github.com/IBM/s390x-abi/releases/download/v1.6/lzsabi_s390x.pdf + */ + struct pt_regs___s390 { unsigned long orig_gpr2; }; @@ -137,21 +172,42 @@ struct pt_regs___s390 { #define __PT_PARM3_REG gprs[4] #define __PT_PARM4_REG gprs[5] #define __PT_PARM5_REG gprs[6] -#define __PT_RET_REG grps[14] + +#define __PT_PARM1_SYSCALL_REG orig_gpr2 +#define __PT_PARM2_SYSCALL_REG __PT_PARM2_REG +#define __PT_PARM3_SYSCALL_REG __PT_PARM3_REG +#define __PT_PARM4_SYSCALL_REG __PT_PARM4_REG +#define __PT_PARM5_SYSCALL_REG __PT_PARM5_REG +#define __PT_PARM6_SYSCALL_REG gprs[7] +#define PT_REGS_PARM1_SYSCALL(x) PT_REGS_PARM1_CORE_SYSCALL(x) +#define PT_REGS_PARM1_CORE_SYSCALL(x) \ + BPF_CORE_READ((const struct pt_regs___s390 *)(x), __PT_PARM1_SYSCALL_REG) + +#define __PT_RET_REG gprs[14] #define __PT_FP_REG gprs[11] /* Works only with CONFIG_FRAME_POINTER */ #define __PT_RC_REG gprs[2] #define __PT_SP_REG gprs[15] #define __PT_IP_REG psw.addr -#define PT_REGS_PARM1_SYSCALL(x) PT_REGS_PARM1_CORE_SYSCALL(x) -#define PT_REGS_PARM1_CORE_SYSCALL(x) BPF_CORE_READ((const struct pt_regs___s390 *)(x), orig_gpr2) #elif defined(bpf_target_arm) +/* + * https://github.com/ARM-software/abi-aa/blob/main/aapcs32/aapcs32.rst#machine-registers + */ + #define __PT_PARM1_REG uregs[0] #define __PT_PARM2_REG uregs[1] #define __PT_PARM3_REG uregs[2] #define __PT_PARM4_REG uregs[3] -#define __PT_PARM5_REG uregs[4] + +#define __PT_PARM1_SYSCALL_REG __PT_PARM1_REG +#define __PT_PARM2_SYSCALL_REG __PT_PARM2_REG +#define __PT_PARM3_SYSCALL_REG __PT_PARM3_REG +#define __PT_PARM4_SYSCALL_REG __PT_PARM4_REG +#define __PT_PARM5_SYSCALL_REG uregs[4] +#define __PT_PARM6_SYSCALL_REG uregs[5] +#define __PT_PARM7_SYSCALL_REG uregs[6] + #define __PT_RET_REG uregs[14] #define __PT_FP_REG uregs[11] /* Works only with CONFIG_FRAME_POINTER */ #define __PT_RC_REG uregs[0] @@ -160,6 +216,10 @@ struct pt_regs___s390 { #elif defined(bpf_target_arm64) +/* + * https://github.com/ARM-software/abi-aa/blob/main/aapcs64/aapcs64.rst#machine-registers + */ + struct pt_regs___arm64 { unsigned long orig_x0; }; @@ -172,21 +232,48 @@ struct pt_regs___arm64 { #define __PT_PARM4_REG regs[3] #define __PT_PARM5_REG regs[4] #define __PT_PARM6_REG regs[5] +#define __PT_PARM7_REG regs[6] +#define __PT_PARM8_REG regs[7] + +#define __PT_PARM1_SYSCALL_REG orig_x0 +#define __PT_PARM2_SYSCALL_REG __PT_PARM2_REG +#define __PT_PARM3_SYSCALL_REG __PT_PARM3_REG +#define __PT_PARM4_SYSCALL_REG __PT_PARM4_REG +#define __PT_PARM5_SYSCALL_REG __PT_PARM5_REG +#define __PT_PARM6_SYSCALL_REG __PT_PARM6_REG +#define PT_REGS_PARM1_SYSCALL(x) PT_REGS_PARM1_CORE_SYSCALL(x) +#define PT_REGS_PARM1_CORE_SYSCALL(x) \ + BPF_CORE_READ((const struct pt_regs___arm64 *)(x), __PT_PARM1_SYSCALL_REG) + #define __PT_RET_REG regs[30] #define __PT_FP_REG regs[29] /* Works only with CONFIG_FRAME_POINTER */ #define __PT_RC_REG regs[0] #define __PT_SP_REG sp #define __PT_IP_REG pc -#define PT_REGS_PARM1_SYSCALL(x) PT_REGS_PARM1_CORE_SYSCALL(x) -#define PT_REGS_PARM1_CORE_SYSCALL(x) BPF_CORE_READ((const struct pt_regs___arm64 *)(x), orig_x0) #elif defined(bpf_target_mips) +/* + * N64 ABI is assumed right now. + * https://en.wikipedia.org/wiki/MIPS_architecture#Calling_conventions + */ + #define __PT_PARM1_REG regs[4] #define __PT_PARM2_REG regs[5] #define __PT_PARM3_REG regs[6] #define __PT_PARM4_REG regs[7] #define __PT_PARM5_REG regs[8] +#define __PT_PARM6_REG regs[9] +#define __PT_PARM7_REG regs[10] +#define __PT_PARM8_REG regs[11] + +#define __PT_PARM1_SYSCALL_REG __PT_PARM1_REG +#define __PT_PARM2_SYSCALL_REG __PT_PARM2_REG +#define __PT_PARM3_SYSCALL_REG __PT_PARM3_REG +#define __PT_PARM4_SYSCALL_REG __PT_PARM4_REG +#define __PT_PARM5_SYSCALL_REG __PT_PARM5_REG /* only N32/N64 */ +#define __PT_PARM6_SYSCALL_REG __PT_PARM6_REG /* only N32/N64 */ + #define __PT_RET_REG regs[31] #define __PT_FP_REG regs[30] /* Works only with CONFIG_FRAME_POINTER */ #define __PT_RC_REG regs[2] @@ -195,26 +282,58 @@ struct pt_regs___arm64 { #elif defined(bpf_target_powerpc) +/* + * http://refspecs.linux-foundation.org/elf/elfspec_ppc.pdf (page 3-14, + * section "Function Calling Sequence") + */ + #define __PT_PARM1_REG gpr[3] #define __PT_PARM2_REG gpr[4] #define __PT_PARM3_REG gpr[5] #define __PT_PARM4_REG gpr[6] #define __PT_PARM5_REG gpr[7] +#define __PT_PARM6_REG gpr[8] +#define __PT_PARM7_REG gpr[9] +#define __PT_PARM8_REG gpr[10] + +/* powerpc does not select ARCH_HAS_SYSCALL_WRAPPER. */ +#define PT_REGS_SYSCALL_REGS(ctx) ctx +#define __PT_PARM1_SYSCALL_REG orig_gpr3 +#define __PT_PARM2_SYSCALL_REG __PT_PARM2_REG +#define __PT_PARM3_SYSCALL_REG __PT_PARM3_REG +#define __PT_PARM4_SYSCALL_REG __PT_PARM4_REG +#define __PT_PARM5_SYSCALL_REG __PT_PARM5_REG +#define __PT_PARM6_SYSCALL_REG __PT_PARM6_REG +#if !defined(__arch64__) +#define __PT_PARM7_SYSCALL_REG __PT_PARM7_REG /* only powerpc (not powerpc64) */ +#endif + #define __PT_RET_REG regs[31] #define __PT_FP_REG __unsupported__ #define __PT_RC_REG gpr[3] #define __PT_SP_REG sp #define __PT_IP_REG nip -/* powerpc does not select ARCH_HAS_SYSCALL_WRAPPER. */ -#define PT_REGS_SYSCALL_REGS(ctx) ctx #elif defined(bpf_target_sparc) +/* + * https://en.wikipedia.org/wiki/Calling_convention#SPARC + */ + #define __PT_PARM1_REG u_regs[UREG_I0] #define __PT_PARM2_REG u_regs[UREG_I1] #define __PT_PARM3_REG u_regs[UREG_I2] #define __PT_PARM4_REG u_regs[UREG_I3] #define __PT_PARM5_REG u_regs[UREG_I4] +#define __PT_PARM6_REG u_regs[UREG_I5] + +#define __PT_PARM1_SYSCALL_REG __PT_PARM1_REG +#define __PT_PARM2_SYSCALL_REG __PT_PARM2_REG +#define __PT_PARM3_SYSCALL_REG __PT_PARM3_REG +#define __PT_PARM4_SYSCALL_REG __PT_PARM4_REG +#define __PT_PARM5_SYSCALL_REG __PT_PARM5_REG +#define __PT_PARM6_SYSCALL_REG __PT_PARM6_REG + #define __PT_RET_REG u_regs[UREG_I7] #define __PT_FP_REG __unsupported__ #define __PT_RC_REG u_regs[UREG_I0] @@ -228,6 +347,11 @@ struct pt_regs___arm64 { #elif defined(bpf_target_riscv) +/* + * https://github.com/riscv-non-isa/riscv-elf-psabi-doc/blob/master/riscv-cc.adoc#risc-v-calling-conventions + */ + +/* riscv provides struct user_regs_struct instead of struct pt_regs to userspace */ #define __PT_REGS_CAST(x) ((const struct user_regs_struct *)(x)) #define __PT_PARM1_REG a0 #define __PT_PARM2_REG a1 @@ -235,30 +359,87 @@ struct pt_regs___arm64 { #define __PT_PARM4_REG a3 #define __PT_PARM5_REG a4 #define __PT_PARM6_REG a5 +#define __PT_PARM7_REG a6 +#define __PT_PARM8_REG a7 + +#define __PT_PARM1_SYSCALL_REG __PT_PARM1_REG +#define __PT_PARM2_SYSCALL_REG __PT_PARM2_REG +#define __PT_PARM3_SYSCALL_REG __PT_PARM3_REG +#define __PT_PARM4_SYSCALL_REG __PT_PARM4_REG +#define __PT_PARM5_SYSCALL_REG __PT_PARM5_REG +#define __PT_PARM6_SYSCALL_REG __PT_PARM6_REG + #define __PT_RET_REG ra #define __PT_FP_REG s0 #define __PT_RC_REG a0 #define __PT_SP_REG sp #define __PT_IP_REG pc -/* riscv does not select ARCH_HAS_SYSCALL_WRAPPER. */ -#define PT_REGS_SYSCALL_REGS(ctx) ctx #elif defined(bpf_target_arc) -/* arc provides struct user_pt_regs instead of struct pt_regs to userspace */ +/* + * Section "Function Calling Sequence" (page 24): + * https://raw.githubusercontent.com/wiki/foss-for-synopsys-dwc-arc-processors/toolchain/files/ARCv2_ABI.pdf + */ + +/* arc provides struct user_regs_struct instead of struct pt_regs to userspace */ #define __PT_REGS_CAST(x) ((const struct user_regs_struct *)(x)) #define __PT_PARM1_REG scratch.r0 #define __PT_PARM2_REG scratch.r1 #define __PT_PARM3_REG scratch.r2 #define __PT_PARM4_REG scratch.r3 #define __PT_PARM5_REG scratch.r4 +#define __PT_PARM6_REG scratch.r5 +#define __PT_PARM7_REG scratch.r6 +#define __PT_PARM8_REG scratch.r7 + +/* arc does not select ARCH_HAS_SYSCALL_WRAPPER. */ +#define PT_REGS_SYSCALL_REGS(ctx) ctx +#define __PT_PARM1_SYSCALL_REG __PT_PARM1_REG +#define __PT_PARM2_SYSCALL_REG __PT_PARM2_REG +#define __PT_PARM3_SYSCALL_REG __PT_PARM3_REG +#define __PT_PARM4_SYSCALL_REG __PT_PARM4_REG +#define __PT_PARM5_SYSCALL_REG __PT_PARM5_REG +#define __PT_PARM6_SYSCALL_REG __PT_PARM6_REG + #define __PT_RET_REG scratch.blink -#define __PT_FP_REG __unsupported__ +#define __PT_FP_REG scratch.fp #define __PT_RC_REG scratch.r0 #define __PT_SP_REG scratch.sp #define __PT_IP_REG scratch.ret -/* arc does not select ARCH_HAS_SYSCALL_WRAPPER. */ + +#elif defined(bpf_target_loongarch) + +/* + * https://docs.kernel.org/loongarch/introduction.html + * https://loongson.github.io/LoongArch-Documentation/LoongArch-ELF-ABI-EN.html + */ + +/* loongarch provides struct user_pt_regs instead of struct pt_regs to userspace */ +#define __PT_REGS_CAST(x) ((const struct user_pt_regs *)(x)) +#define __PT_PARM1_REG regs[4] +#define __PT_PARM2_REG regs[5] +#define __PT_PARM3_REG regs[6] +#define __PT_PARM4_REG regs[7] +#define __PT_PARM5_REG regs[8] +#define __PT_PARM6_REG regs[9] +#define __PT_PARM7_REG regs[10] +#define __PT_PARM8_REG regs[11] + +/* loongarch does not select ARCH_HAS_SYSCALL_WRAPPER. */ #define PT_REGS_SYSCALL_REGS(ctx) ctx +#define __PT_PARM1_SYSCALL_REG __PT_PARM1_REG +#define __PT_PARM2_SYSCALL_REG __PT_PARM2_REG +#define __PT_PARM3_SYSCALL_REG __PT_PARM3_REG +#define __PT_PARM4_SYSCALL_REG __PT_PARM4_REG +#define __PT_PARM5_SYSCALL_REG __PT_PARM5_REG +#define __PT_PARM6_SYSCALL_REG __PT_PARM6_REG + +#define __PT_RET_REG regs[1] +#define __PT_FP_REG regs[22] +#define __PT_RC_REG regs[4] +#define __PT_SP_REG regs[3] +#define __PT_IP_REG csr_era #endif @@ -266,17 +447,49 @@ struct pt_regs___arm64 { struct pt_regs; -/* allow some architecutres to override `struct pt_regs` */ +/* allow some architectures to override `struct pt_regs` */ #ifndef __PT_REGS_CAST #define __PT_REGS_CAST(x) (x) #endif +/* + * Different architectures support different number of arguments passed + * through registers. i386 supports just 3, some arches support up to 8. + */ +#ifndef __PT_PARM4_REG +#define __PT_PARM4_REG __unsupported__ +#endif +#ifndef __PT_PARM5_REG +#define __PT_PARM5_REG __unsupported__ +#endif +#ifndef __PT_PARM6_REG +#define __PT_PARM6_REG __unsupported__ +#endif +#ifndef __PT_PARM7_REG +#define __PT_PARM7_REG __unsupported__ +#endif +#ifndef __PT_PARM8_REG +#define __PT_PARM8_REG __unsupported__ +#endif +/* + * Similarly, syscall-specific conventions might differ between function call + * conventions within each architecutre. All supported architectures pass + * either 6 or 7 syscall arguments in registers. + * + * See syscall(2) manpage for succinct table with information on each arch. + */ +#ifndef __PT_PARM7_SYSCALL_REG +#define __PT_PARM7_SYSCALL_REG __unsupported__ +#endif + #define PT_REGS_PARM1(x) (__PT_REGS_CAST(x)->__PT_PARM1_REG) #define PT_REGS_PARM2(x) (__PT_REGS_CAST(x)->__PT_PARM2_REG) #define PT_REGS_PARM3(x) (__PT_REGS_CAST(x)->__PT_PARM3_REG) #define PT_REGS_PARM4(x) (__PT_REGS_CAST(x)->__PT_PARM4_REG) #define PT_REGS_PARM5(x) (__PT_REGS_CAST(x)->__PT_PARM5_REG) #define PT_REGS_PARM6(x) (__PT_REGS_CAST(x)->__PT_PARM6_REG) +#define PT_REGS_PARM7(x) (__PT_REGS_CAST(x)->__PT_PARM7_REG) +#define PT_REGS_PARM8(x) (__PT_REGS_CAST(x)->__PT_PARM8_REG) #define PT_REGS_RET(x) (__PT_REGS_CAST(x)->__PT_RET_REG) #define PT_REGS_FP(x) (__PT_REGS_CAST(x)->__PT_FP_REG) #define PT_REGS_RC(x) (__PT_REGS_CAST(x)->__PT_RC_REG) @@ -288,6 +501,9 @@ struct pt_regs; #define PT_REGS_PARM3_CORE(x) BPF_CORE_READ(__PT_REGS_CAST(x), __PT_PARM3_REG) #define PT_REGS_PARM4_CORE(x) BPF_CORE_READ(__PT_REGS_CAST(x), __PT_PARM4_REG) #define PT_REGS_PARM5_CORE(x) BPF_CORE_READ(__PT_REGS_CAST(x), __PT_PARM5_REG) +#define PT_REGS_PARM6_CORE(x) BPF_CORE_READ(__PT_REGS_CAST(x), __PT_PARM6_REG) +#define PT_REGS_PARM7_CORE(x) BPF_CORE_READ(__PT_REGS_CAST(x), __PT_PARM7_REG) +#define PT_REGS_PARM8_CORE(x) BPF_CORE_READ(__PT_REGS_CAST(x), __PT_PARM8_REG) #define PT_REGS_RET_CORE(x) BPF_CORE_READ(__PT_REGS_CAST(x), __PT_RET_REG) #define PT_REGS_FP_CORE(x) BPF_CORE_READ(__PT_REGS_CAST(x), __PT_FP_REG) #define PT_REGS_RC_CORE(x) BPF_CORE_READ(__PT_REGS_CAST(x), __PT_RC_REG) @@ -314,24 +530,33 @@ struct pt_regs; #endif #ifndef PT_REGS_PARM1_SYSCALL -#define PT_REGS_PARM1_SYSCALL(x) PT_REGS_PARM1(x) +#define PT_REGS_PARM1_SYSCALL(x) (__PT_REGS_CAST(x)->__PT_PARM1_SYSCALL_REG) +#define PT_REGS_PARM1_CORE_SYSCALL(x) BPF_CORE_READ(__PT_REGS_CAST(x), __PT_PARM1_SYSCALL_REG) +#endif +#ifndef PT_REGS_PARM2_SYSCALL +#define PT_REGS_PARM2_SYSCALL(x) (__PT_REGS_CAST(x)->__PT_PARM2_SYSCALL_REG) +#define PT_REGS_PARM2_CORE_SYSCALL(x) BPF_CORE_READ(__PT_REGS_CAST(x), __PT_PARM2_SYSCALL_REG) +#endif +#ifndef PT_REGS_PARM3_SYSCALL +#define PT_REGS_PARM3_SYSCALL(x) (__PT_REGS_CAST(x)->__PT_PARM3_SYSCALL_REG) +#define PT_REGS_PARM3_CORE_SYSCALL(x) BPF_CORE_READ(__PT_REGS_CAST(x), __PT_PARM3_SYSCALL_REG) #endif -#define PT_REGS_PARM2_SYSCALL(x) PT_REGS_PARM2(x) -#define PT_REGS_PARM3_SYSCALL(x) PT_REGS_PARM3(x) #ifndef PT_REGS_PARM4_SYSCALL -#define PT_REGS_PARM4_SYSCALL(x) PT_REGS_PARM4(x) +#define PT_REGS_PARM4_SYSCALL(x) (__PT_REGS_CAST(x)->__PT_PARM4_SYSCALL_REG) +#define PT_REGS_PARM4_CORE_SYSCALL(x) BPF_CORE_READ(__PT_REGS_CAST(x), __PT_PARM4_SYSCALL_REG) #endif -#define PT_REGS_PARM5_SYSCALL(x) PT_REGS_PARM5(x) - -#ifndef PT_REGS_PARM1_CORE_SYSCALL -#define PT_REGS_PARM1_CORE_SYSCALL(x) PT_REGS_PARM1_CORE(x) +#ifndef PT_REGS_PARM5_SYSCALL +#define PT_REGS_PARM5_SYSCALL(x) (__PT_REGS_CAST(x)->__PT_PARM5_SYSCALL_REG) +#define PT_REGS_PARM5_CORE_SYSCALL(x) BPF_CORE_READ(__PT_REGS_CAST(x), __PT_PARM5_SYSCALL_REG) #endif -#define PT_REGS_PARM2_CORE_SYSCALL(x) PT_REGS_PARM2_CORE(x) -#define PT_REGS_PARM3_CORE_SYSCALL(x) PT_REGS_PARM3_CORE(x) -#ifndef PT_REGS_PARM4_CORE_SYSCALL -#define PT_REGS_PARM4_CORE_SYSCALL(x) PT_REGS_PARM4_CORE(x) +#ifndef PT_REGS_PARM6_SYSCALL +#define PT_REGS_PARM6_SYSCALL(x) (__PT_REGS_CAST(x)->__PT_PARM6_SYSCALL_REG) +#define PT_REGS_PARM6_CORE_SYSCALL(x) BPF_CORE_READ(__PT_REGS_CAST(x), __PT_PARM6_SYSCALL_REG) +#endif +#ifndef PT_REGS_PARM7_SYSCALL +#define PT_REGS_PARM7_SYSCALL(x) (__PT_REGS_CAST(x)->__PT_PARM7_SYSCALL_REG) +#define PT_REGS_PARM7_CORE_SYSCALL(x) BPF_CORE_READ(__PT_REGS_CAST(x), __PT_PARM7_SYSCALL_REG) #endif -#define PT_REGS_PARM5_CORE_SYSCALL(x) PT_REGS_PARM5_CORE(x) #else /* defined(bpf_target_defined) */ @@ -341,6 +566,8 @@ struct pt_regs; #define PT_REGS_PARM4(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; }) #define PT_REGS_PARM5(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; }) #define PT_REGS_PARM6(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; }) +#define PT_REGS_PARM7(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; }) +#define PT_REGS_PARM8(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; }) #define PT_REGS_RET(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; }) #define PT_REGS_FP(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; }) #define PT_REGS_RC(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; }) @@ -352,6 +579,9 @@ struct pt_regs; #define PT_REGS_PARM3_CORE(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; }) #define PT_REGS_PARM4_CORE(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; }) #define PT_REGS_PARM5_CORE(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; }) +#define PT_REGS_PARM6_CORE(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; }) +#define PT_REGS_PARM7_CORE(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; }) +#define PT_REGS_PARM8_CORE(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; }) #define PT_REGS_RET_CORE(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; }) #define PT_REGS_FP_CORE(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; }) #define PT_REGS_RC_CORE(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; }) @@ -366,12 +596,16 @@ struct pt_regs; #define PT_REGS_PARM3_SYSCALL(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; }) #define PT_REGS_PARM4_SYSCALL(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; }) #define PT_REGS_PARM5_SYSCALL(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; }) +#define PT_REGS_PARM6_SYSCALL(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; }) +#define PT_REGS_PARM7_SYSCALL(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; }) #define PT_REGS_PARM1_CORE_SYSCALL(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; }) #define PT_REGS_PARM2_CORE_SYSCALL(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; }) #define PT_REGS_PARM3_CORE_SYSCALL(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; }) #define PT_REGS_PARM4_CORE_SYSCALL(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; }) #define PT_REGS_PARM5_CORE_SYSCALL(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; }) +#define PT_REGS_PARM6_CORE_SYSCALL(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; }) +#define PT_REGS_PARM7_CORE_SYSCALL(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; }) #endif /* defined(bpf_target_defined) */ @@ -558,6 +792,8 @@ struct pt_regs; #define ___bpf_kprobe_args4(x, args...) ___bpf_kprobe_args3(args), (void *)PT_REGS_PARM4(ctx) #define ___bpf_kprobe_args5(x, args...) ___bpf_kprobe_args4(args), (void *)PT_REGS_PARM5(ctx) #define ___bpf_kprobe_args6(x, args...) ___bpf_kprobe_args5(args), (void *)PT_REGS_PARM6(ctx) +#define ___bpf_kprobe_args7(x, args...) ___bpf_kprobe_args6(args), (void *)PT_REGS_PARM7(ctx) +#define ___bpf_kprobe_args8(x, args...) ___bpf_kprobe_args7(args), (void *)PT_REGS_PARM8(ctx) #define ___bpf_kprobe_args(args...) ___bpf_apply(___bpf_kprobe_args, ___bpf_narg(args))(args) /* @@ -614,6 +850,8 @@ static __always_inline typeof(name(0)) ____##name(struct pt_regs *ctx, ##args) #define ___bpf_syscall_args3(x, args...) ___bpf_syscall_args2(args), (void *)PT_REGS_PARM3_SYSCALL(regs) #define ___bpf_syscall_args4(x, args...) ___bpf_syscall_args3(args), (void *)PT_REGS_PARM4_SYSCALL(regs) #define ___bpf_syscall_args5(x, args...) ___bpf_syscall_args4(args), (void *)PT_REGS_PARM5_SYSCALL(regs) +#define ___bpf_syscall_args6(x, args...) ___bpf_syscall_args5(args), (void *)PT_REGS_PARM6_SYSCALL(regs) +#define ___bpf_syscall_args7(x, args...) ___bpf_syscall_args6(args), (void *)PT_REGS_PARM7_SYSCALL(regs) #define ___bpf_syscall_args(args...) ___bpf_apply(___bpf_syscall_args, ___bpf_narg(args))(args) /* If kernel doesn't have CONFIG_ARCH_HAS_SYSCALL_WRAPPER, we have to BPF_CORE_READ from pt_regs */ @@ -623,6 +861,8 @@ static __always_inline typeof(name(0)) ____##name(struct pt_regs *ctx, ##args) #define ___bpf_syswrap_args3(x, args...) ___bpf_syswrap_args2(args), (void *)PT_REGS_PARM3_CORE_SYSCALL(regs) #define ___bpf_syswrap_args4(x, args...) ___bpf_syswrap_args3(args), (void *)PT_REGS_PARM4_CORE_SYSCALL(regs) #define ___bpf_syswrap_args5(x, args...) ___bpf_syswrap_args4(args), (void *)PT_REGS_PARM5_CORE_SYSCALL(regs) +#define ___bpf_syswrap_args6(x, args...) ___bpf_syswrap_args5(args), (void *)PT_REGS_PARM6_CORE_SYSCALL(regs) +#define ___bpf_syswrap_args7(x, args...) ___bpf_syswrap_args6(args), (void *)PT_REGS_PARM7_CORE_SYSCALL(regs) #define ___bpf_syswrap_args(args...) ___bpf_apply(___bpf_syswrap_args, ___bpf_narg(args))(args) /* @@ -672,4 +912,11 @@ ____##name(struct pt_regs *ctx, ##args) #define BPF_KPROBE_SYSCALL BPF_KSYSCALL +/* BPF_UPROBE and BPF_URETPROBE are identical to BPF_KPROBE and BPF_KRETPROBE, + * but are named way less confusingly for SEC("uprobe") and SEC("uretprobe") + * use cases. + */ +#define BPF_UPROBE(name, args...) BPF_KPROBE(name, ##args) +#define BPF_URETPROBE(name, args...) BPF_KRETPROBE(name, ##args) + #endif diff --git a/src/btf.c b/src/btf.c index c80a5d2..f77b637 100644 --- a/src/btf.c +++ b/src/btf.c @@ -20,17 +20,6 @@ #include #endif -#ifdef HAVE_ELFIO -#include "elfio_c_wrapper.h" - -typedef struct Elf64_Ehdr Elf64_Ehdr; -typedef struct Elf64_Shdr Elf64_Shdr; -typedef struct { - void *d_buf; - size_t d_size; -} Elf_Data; -#endif - #include "btf.h" #include "bpf.h" #include "libbpf.h" @@ -463,6 +452,165 @@ static int btf_parse_type_sec(struct btf *btf) return 0; } +static int btf_validate_str(const struct btf *btf, __u32 str_off, const char *what, __u32 type_id) +{ + const char *s; + + s = btf__str_by_offset(btf, str_off); + if (!s) { + pr_warn("btf: type [%u]: invalid %s (string offset %u)\n", type_id, what, str_off); + return -EINVAL; + } + + return 0; +} + +static int btf_validate_id(const struct btf *btf, __u32 id, __u32 ctx_id) +{ + const struct btf_type *t; + + t = btf__type_by_id(btf, id); + if (!t) { + pr_warn("btf: type [%u]: invalid referenced type ID %u\n", ctx_id, id); + return -EINVAL; + } + + return 0; +} + +static int btf_validate_type(const struct btf *btf, const struct btf_type *t, __u32 id) +{ + __u32 kind = btf_kind(t); + int err, i, n; + + err = btf_validate_str(btf, t->name_off, "type name", id); + if (err) + return err; + + switch (kind) { + case BTF_KIND_UNKN: + case BTF_KIND_INT: + case BTF_KIND_FWD: + case BTF_KIND_FLOAT: + break; + case BTF_KIND_PTR: + case BTF_KIND_TYPEDEF: + case BTF_KIND_VOLATILE: + case BTF_KIND_CONST: + case BTF_KIND_RESTRICT: + case BTF_KIND_VAR: + case BTF_KIND_DECL_TAG: + case BTF_KIND_TYPE_TAG: + err = btf_validate_id(btf, t->type, id); + if (err) + return err; + break; + case BTF_KIND_ARRAY: { + const struct btf_array *a = btf_array(t); + + err = btf_validate_id(btf, a->type, id); + err = err ?: btf_validate_id(btf, a->index_type, id); + if (err) + return err; + break; + } + case BTF_KIND_STRUCT: + case BTF_KIND_UNION: { + const struct btf_member *m = btf_members(t); + + n = btf_vlen(t); + for (i = 0; i < n; i++, m++) { + err = btf_validate_str(btf, m->name_off, "field name", id); + err = err ?: btf_validate_id(btf, m->type, id); + if (err) + return err; + } + break; + } + case BTF_KIND_ENUM: { + const struct btf_enum *m = btf_enum(t); + + n = btf_vlen(t); + for (i = 0; i < n; i++, m++) { + err = btf_validate_str(btf, m->name_off, "enum name", id); + if (err) + return err; + } + break; + } + case BTF_KIND_ENUM64: { + const struct btf_enum64 *m = btf_enum64(t); + + n = btf_vlen(t); + for (i = 0; i < n; i++, m++) { + err = btf_validate_str(btf, m->name_off, "enum name", id); + if (err) + return err; + } + break; + } + case BTF_KIND_FUNC: { + const struct btf_type *ft; + + err = btf_validate_id(btf, t->type, id); + if (err) + return err; + ft = btf__type_by_id(btf, t->type); + if (btf_kind(ft) != BTF_KIND_FUNC_PROTO) { + pr_warn("btf: type [%u]: referenced type [%u] is not FUNC_PROTO\n", id, t->type); + return -EINVAL; + } + break; + } + case BTF_KIND_FUNC_PROTO: { + const struct btf_param *m = btf_params(t); + + n = btf_vlen(t); + for (i = 0; i < n; i++, m++) { + err = btf_validate_str(btf, m->name_off, "param name", id); + err = err ?: btf_validate_id(btf, m->type, id); + if (err) + return err; + } + break; + } + case BTF_KIND_DATASEC: { + const struct btf_var_secinfo *m = btf_var_secinfos(t); + + n = btf_vlen(t); + for (i = 0; i < n; i++, m++) { + err = btf_validate_id(btf, m->type, id); + if (err) + return err; + } + break; + } + default: + pr_warn("btf: type [%u]: unrecognized kind %u\n", id, kind); + return -EINVAL; + } + return 0; +} + +/* Validate basic sanity of BTF. It's intentionally less thorough than + * kernel's validation and validates only properties of BTF that libbpf relies + * on to be correct (e.g., valid type IDs, valid string offsets, etc) + */ +static int btf_sanity_check(const struct btf *btf) +{ + const struct btf_type *t; + __u32 i, n = btf__type_cnt(btf); + int err; + + for (i = 1; i < n; i++) { + t = btf_type_by_id(btf, i); + err = btf_validate_type(btf, t, i); + if (err) + return err; + } + return 0; +} + __u32 btf__type_cnt(const struct btf *btf) { return btf->start_id + btf->nr_types; @@ -917,6 +1065,7 @@ static struct btf *btf_new(const void *data, __u32 size, struct btf *base_btf) err = btf_parse_str_sec(btf); err = err ?: btf_parse_type_sec(btf); + err = err ?: btf_sanity_check(btf); if (err) goto done; @@ -958,7 +1107,7 @@ static struct btf *btf_parse_elf(const char *path, struct btf *base_btf, struct btf_ext **btf_ext) { Elf_Data *btf_data = NULL, *btf_ext_data = NULL; - int err = 0, idx = 0; + int err = 0, fd = -1, idx = 0; struct btf *btf = NULL; #ifdef HAVE_LIBELF Elf_Scn *scn = NULL; @@ -978,7 +1127,7 @@ static struct btf *btf_parse_elf(const char *path, struct btf *base_btf, } #endif - int fd = open(path, O_RDONLY | O_CLOEXEC); + fd = open(path, O_RDONLY | O_CLOEXEC); if (fd < 0) { err = -errno; pr_warn("failed to open %s: %s\n", path, strerror(errno)); @@ -1105,7 +1254,7 @@ static struct btf *btf_parse_elf(const char *path, struct btf *base_btf, if (!btf_data) { pr_warn("failed to find '%s' ELF section in %s\n", BTF_ELF_SEC, path); - err = -ENOENT; + err = -ENODATA; goto done; } btf = btf_new(btf_data->d_buf, btf_data->d_size, base_btf); @@ -1177,7 +1326,7 @@ static struct btf *btf_parse_raw(const char *path, struct btf *base_btf) int err = 0; long sz; - f = fopen(path, "rb"); + f = fopen(path, "rbe"); if (!f) { err = -errno; goto err_out; @@ -1461,9 +1610,9 @@ struct btf *btf_get_from_fd(int btf_fd, struct btf *base_btf) void *ptr; int err; - /* we won't know btf_size until we call bpf_obj_get_info_by_fd(). so + /* we won't know btf_size until we call bpf_btf_get_info_by_fd(). so * let's start with a sane default - 4KiB here - and resize it only if - * bpf_obj_get_info_by_fd() needs a bigger buffer. + * bpf_btf_get_info_by_fd() needs a bigger buffer. */ last_size = 4096; ptr = malloc(last_size); @@ -1473,7 +1622,7 @@ struct btf *btf_get_from_fd(int btf_fd, struct btf *base_btf) memset(&btf_info, 0, sizeof(btf_info)); btf_info.btf = ptr_to_u64(ptr); btf_info.btf_size = last_size; - err = bpf_obj_get_info_by_fd(btf_fd, &btf_info, &len); + err = bpf_btf_get_info_by_fd(btf_fd, &btf_info, &len); if (!err && btf_info.btf_size > last_size) { void *temp_ptr; @@ -1491,7 +1640,7 @@ struct btf *btf_get_from_fd(int btf_fd, struct btf *base_btf) btf_info.btf = ptr_to_u64(ptr); btf_info.btf_size = last_size; - err = bpf_obj_get_info_by_fd(btf_fd, &btf_info, &len); + err = bpf_btf_get_info_by_fd(btf_fd, &btf_info, &len); } if (err || btf_info.btf_size > last_size) { diff --git a/src/btf_dump.c b/src/btf_dump.c index 580985e..4d9f30b 100644 --- a/src/btf_dump.c +++ b/src/btf_dump.c @@ -2250,9 +2250,25 @@ static int btf_dump_type_data_check_overflow(struct btf_dump *d, const struct btf_type *t, __u32 id, const void *data, - __u8 bits_offset) + __u8 bits_offset, + __u8 bit_sz) { - __s64 size = btf__resolve_size(d->btf, id); + __s64 size; + + if (bit_sz) { + /* bits_offset is at most 7. bit_sz is at most 128. */ + __u8 nr_bytes = (bits_offset + bit_sz + 7) / 8; + + /* When bit_sz is non zero, it is called from + * btf_dump_struct_data() where it only cares about + * negative error value. + * Return nr_bytes in success case to make it + * consistent as the regular integer case below. + */ + return data + nr_bytes > d->typed_dump->data_end ? -E2BIG : nr_bytes; + } + + size = btf__resolve_size(d->btf, id); if (size < 0 || size >= INT_MAX) { pr_warn("unexpected size [%zu] for id [%u]\n", @@ -2407,7 +2423,7 @@ static int btf_dump_dump_type_data(struct btf_dump *d, { int size, err = 0; - size = btf_dump_type_data_check_overflow(d, t, id, data, bits_offset); + size = btf_dump_type_data_check_overflow(d, t, id, data, bits_offset, bit_sz); if (size < 0) return size; err = btf_dump_type_data_check_zero(d, t, id, data, bits_offset, bit_sz); diff --git a/src/elf.c b/src/elf.c new file mode 100644 index 0000000..7412055 --- /dev/null +++ b/src/elf.c @@ -0,0 +1,635 @@ +// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) + +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif + +#ifdef HAVE_LIBELF +#include +#include +#endif + +#include +#include + +#include "libbpf_internal.h" +#include "str_error.h" + + + +#define STRERR_BUFSIZE 128 + +/* A SHT_GNU_versym section holds 16-bit words. This bit is set if + * the symbol is hidden and can only be seen when referenced using an + * explicit version number. This is a GNU extension. + */ +#define VERSYM_HIDDEN 0x8000 + +/* This is the mask for the rest of the data in a word read from a + * SHT_GNU_versym section. + */ +#define VERSYM_VERSION 0x7fff + + +#ifdef HAVE_LIBELF +int elf_open(const char *binary_path, struct elf_fd *elf_fd) +{ + char errmsg[STRERR_BUFSIZE]; + int fd, ret; + Elf *elf; + + if (elf_version(EV_CURRENT) == EV_NONE) { + pr_warn("elf: failed to init libelf for %s\n", binary_path); + return -LIBBPF_ERRNO__LIBELF; + } + fd = open(binary_path, O_RDONLY | O_CLOEXEC); + if (fd < 0) { + ret = -errno; + pr_warn("elf: failed to open %s: %s\n", binary_path, + libbpf_strerror_r(ret, errmsg, sizeof(errmsg))); + return ret; + } + elf = elf_begin(fd, ELF_C_READ_MMAP, NULL); + if (!elf) { + pr_warn("elf: could not read elf from %s: %s\n", binary_path, elf_errmsg(-1)); + close(fd); + return -LIBBPF_ERRNO__FORMAT; + } + elf_fd->fd = fd; + elf_fd->elf = elf; + return 0; +} +#elif HAVE_ELFIO +int elf_open(const char *binary_path, struct elf_fd *elf_fd) +{ + pelfio_t pelfio = elfio_new(); + bool ret = false; + ret = elfio_load(pelfio, binary_path); + if (!ret) { + pr_warn("elf: could not read elf from %s: %s\n", binary_path, elf_errmsg(-1)); + return -LIBBPF_ERRNO__FORMAT; + } + elf_fd->elf = pelfio; + elf_fd->fd = -1; + return 0; +} +#endif + + +void elf_close(struct elf_fd *elf_fd) +{ + if (!elf_fd) + return; +#ifdef HAVE_LIBELF + elf_end(elf_fd->elf); + close(elf_fd->fd); +#elif HAVE_ELFIO + elfio_delete(elf_fd->elf); +#endif +} + +/* Return next ELF section of sh_type after scn, or first of that type if scn is NULL. */ +#ifdef HAVE_LIBELF +static Elf_Scn *elf_find_next_scn_by_type(Elf *elf, int sh_type, Elf_Scn *scn) +{ + while ((scn = elf_nextscn(elf, scn)) != NULL) { + GElf_Shdr sh; + + if (!gelf_getshdr(scn, &sh)) + continue; + if (sh.sh_type == sh_type) + return scn; + } + return NULL; +} +#elif HAVE_ELFIO +static psection_t elf_find_next_scn_by_type(pelfio_t pelfio, int sh_type, psection_t pscn) +{ + int secno = elfio_get_sections_num(pelfio); + int j = 0; + if (pscn != NULL) { + for (int i = 0; i < secno; i++) { + psection_t psection = elfio_get_section_by_index(pelfio, i); + if (psection == pscn) { + j = i; + } + } + } + for (; j < secno; j++) { + psection_t psection = elfio_get_section_by_index(pelfio, j); + Elf_Word sec_type = elfio_section_get_type(psection); + if (sec_type == sh_type) { + return psection; + } + } + return NULL; +} +#endif + +static int elf_sym_iter_new(struct elf_sym_iter *iter,pelfio_t elf, const char *binary_path, int sh_type, int st_type) +{ + psection_t pSec = NULL; + memset(iter, 0, sizeof(*iter)); + pSec = elf_find_next_scn_by_type(elf, sh_type, NULL); + iter->strtabidx = elfio_section_get_link(pSec); + iter->syms->d_buf = (void*)elfio_section_get_data(pSec); + iter->syms->d_size = elfio_section_get_size(pSec); + if (!iter->syms->d_buf) { + pr_warn("elf: failed to get symbols for symtab section in '%s': %s\n", + binary_path, elf_errmsg(-1)); + return -EINVAL; + } + iter->nr_syms = iter->syms->d_size / elfio_section_get_entry_size(pSec); + iter->symsSec = pSec; + iter->elf = elf; + iter->st_type = st_type; + /* Version symbol table is meaningful to dynsym only */ + if (sh_type != SHT_DYNSYM) + return 0; + pSec = elf_find_next_scn_by_type(elf, SHT_GNU_versym, NULL); + if (!pSec) { + return 0; + } + iter->versyms->d_buf = (void*)elfio_section_get_data(pSec); + iter->versyms->d_size = elfio_section_get_size(pSec); + pSec = elf_find_next_scn_by_type(elf, SHT_GNU_verdef, NULL); + if (!pSec) { + return 0; + } + iter->verdefs->d_buf = (void*)elfio_section_get_data(pSec); + iter->verdefs->d_size = elfio_section_get_size(pSec); + if (!iter->verdefs->d_buf) { + pr_warn("elf: failed to get verdef ELF section in '%s'\n", binary_path); + return -EINVAL; + } + iter->verdef_strtabidx = elfio_section_get_link(pSec); + return 0; +} + +#ifdef HAVA_ELFIO +static GElf_Shdr *elf_sec_hdr_by_idx(const pelfio_t elf, size_t idx, GElf_Shdr *sheader) +{ + psection_t psection = elfio_get_section_by_index(elf, idx); + sheader->sh_name = elfio_section_get_name_string_offset(psection); + sheader->sh_type = elfio_section_get_type(psection); + sheader->sh_flags = elfio_section_get_flags(psection); + sheader->sh_addr = elfio_section_get_address(psection); + sheader->sh_offset = elfio_section_get_offset(psection); + sheader->sh_size = elfio_section_get_size(psection); + sheader->sh_link = elfio_section_get_link(psection); + sheader->sh_info = elfio_section_get_info(psection); + sheader->sh_addralign = elfio_section_get_addr_align(psection); + sheader->sh_entsize = elfio_section_get_entry_size(psection); + return sheader; +} +#endif //HAVA_ELFIO + + +static struct elf_sym *elf_sym_iter_next(struct elf_sym_iter *iter) +{ + struct elf_sym *ret = &iter->sym; + GElf_Sym *sym = &ret->sym; + GElf_Versym versym; +#ifdef HAVA_LIBELF + Elf_Scn *sym_scn; +#elif HAVE_ELFIO + psection_t sym_scn; +#endif + const char *name = NULL; + size_t idx; + for (idx = iter->next_sym_idx; idx < iter->nr_syms; idx++) { +#ifdef HAVA_LIBELF + if (!gelf_getsym(iter->syms, idx, sym)) + continue; + if (GELF_ST_TYPE(sym->st_info) != iter->st_type) + continue; + name = elf_strptr(iter->elf, iter->strtabidx, sym->st_name); + if (!name) + continue; + sym_scn = elf_getscn(iter->elf, sym->st_shndx); + if (!sym_scn) + continue; + if (!gelf_getshdr(sym_scn, &ret->sh)) + continue; +#elif HAVA_ELFIO + if(memcpy(sym,iter->sysms->d_buf + idx,sizeof(GElf_Sym) == NULL) { + continue; + } + if(((sym->st_info) & 0xf) != iter->st_type) { + continue; + } + psection_t psection = elfio_get_section_by_index(iter->elf, iter->strtabidx); + if (!psection) + return -LIBBPF_ERRNO__FORMAT; + pstring_t strstring = elfio_string_section_accessor_new(psection); + name = elfio_string_get_string(strstring, sym->st_name); + if(!name) { + continue; + } + if(!elf_sec_hdr_by_idx(iter->elf, sym->st_shndx, &ret->sh)) { + continue; + } +#endif + iter->next_sym_idx = idx + 1; + ret->name = name; + ret->ver = 0; + ret->hidden = false; + if (iter->versyms) { +#ifdef HAVA_LIBELF + if (!gelf_getversym(iter->versyms, idx, &versym)) + continue; +#elif HAVA_ELFIO + versym = (GElf_Versym)iter->versysm->d_buf[idx]; +#endif + ret->ver = versym & VERSYM_VERSION; + ret->hidden = versym & VERSYM_HIDDEN; + } + return ret; + } + return NULL; +} + +static const char *elf_get_vername(struct elf_sym_iter *iter, int ver) +{ + GElf_Verdaux verdaux; + GElf_Verdef verdef; + int offset; + if (!iter->verdefs) + return NULL; + offset = 0; +#ifdef HAVE_LIBELF + while (gelf_getverdef(iter->verdefs, offset, &verdef)) { + if (verdef.vd_ndx != ver) { + if (!verdef.vd_next) + break; + offset += verdef.vd_next; + continue; + } + if (!gelf_getverdaux(iter->verdefs, offset + verdef.vd_aux, &verdaux)) + break; + return elf_strptr(iter->elf, iter->verdef_strtabidx, verdaux.vda_name); + } +#elif HAVE_ELFIO + while (memcpy(&verdef, (void *)iter->verdefs->d_buf + offset, sizeof(GElf_Verdef)) != NULL) { + if (verdef.vd_ndx != ver) { + if (!verdef.vd_next) + break; + offset += verdef.vd_next; + continue; + } + if(memcpy(&verdaux, (void *)iter->verdefs->d_buf + offset + verdef.vd_aux, sizeof(GElf_Verdaux)) == NULL) { + break; + } + psection_t psection = elfio_get_section_by_index(iter->elf, iter->verdef_strtabidx); + if (!psection) + return NULL; + pstring_t strstring = elfio_string_section_accessor_new(psection); + return elfio_string_get_string(strstring, verdaux.vda_name); + } +#endif +return NULL; +} + +static bool symbol_match(struct elf_sym_iter *iter, int sh_type, struct elf_sym *sym, + const char *name, size_t name_len, const char *lib_ver) +{ + const char *ver_name; + + /* Symbols are in forms of func, func@LIB_VER or func@@LIB_VER + * make sure the func part matches the user specified name + */ + if (strncmp(sym->name, name, name_len) != 0) + return false; + + /* ...but we don't want a search for "foo" to match 'foo2" also, so any + * additional characters in sname should be of the form "@@LIB". + */ + if (sym->name[name_len] != '\0' && sym->name[name_len] != '@') + return false; + + /* If user does not specify symbol version, then we got a match */ + if (!lib_ver) + return true; + + /* If user specifies symbol version, for dynamic symbols, + * get version name from ELF verdef section for comparison. + */ + if (sh_type == SHT_DYNSYM) { + ver_name = elf_get_vername(iter, sym->ver); + if (!ver_name) + return false; + return strcmp(ver_name, lib_ver) == 0; + } + + /* For normal symbols, it is already in form of func@LIB_VER */ + return strcmp(sym->name, name) == 0; +} + +/* Transform symbol's virtual address (absolute for binaries and relative + * for shared libs) into file offset, which is what kernel is expecting + * for uprobe/uretprobe attachment. + * See Documentation/trace/uprobetracer.rst for more details. This is done + * by looking up symbol's containing section's header and using iter's virtual + * address (sh_addr) and corresponding file offset (sh_offset) to transform + * sym.st_value (virtual address) into desired final file offset. + */ +static unsigned long elf_sym_offset(struct elf_sym *sym) +{ + return sym->sym.st_value - sym->sh.sh_addr + sym->sh.sh_offset; +} + +/* Find offset of function name in the provided ELF object. "binary_path" is + * the path to the ELF binary represented by "elf", and only used for error + * reporting matters. "name" matches symbol name or name@@LIB for library + * functions. + */ +#ifdef HAVE_LIBELF +long elf_find_func_offset(Elf *elf, const char *binary_path, const char *name) +#elif HAVE_ELFIO +long elf_find_func_offset(pelfio_t elf, const char *binary_path, const char *name) +#endif +{ + int i, sh_types[2] = { SHT_DYNSYM, SHT_SYMTAB }; + const char *at_symbol, *lib_ver; + bool is_shared_lib; + long ret = -ENOENT; + size_t name_len; +#ifdef HAVE_LIBELF + GElf_Ehdr ehdr; + + if (!gelf_getehdr(elf, &ehdr)) { + pr_warn("elf: failed to get ehdr from %s: %s\n", binary_path, elf_errmsg(-1)); + ret = -LIBBPF_ERRNO__FORMAT; + goto out; + } + + /* for shared lib case, we do not need to calculate relative offset */ + is_shared_lib = ehdr.e_type == ET_DYN; +#elif HAVA_ELFIO + is_shared_lib = (ET_DYN == elfio_get_type(pelfio)); +#endif + /* Does name specify "@@LIB_VER" or "@LIB_VER" ? */ + at_symbol = strchr(name, '@'); + if (at_symbol) { + name_len = at_symbol - name; + /* skip second @ if it's @@LIB_VER case */ + if (at_symbol[1] == '@') + at_symbol++; + lib_ver = at_symbol + 1; + } else { + name_len = strlen(name); + lib_ver = NULL; + } + + /* Search SHT_DYNSYM, SHT_SYMTAB for symbol. This search order is used because if + * a binary is stripped, it may only have SHT_DYNSYM, and a fully-statically + * linked binary may not have SHT_DYMSYM, so absence of a section should not be + * reported as a warning/error. + */ + for (i = 0; i < ARRAY_SIZE(sh_types); i++) { + struct elf_sym_iter iter; + struct elf_sym *sym; + int last_bind = -1; + int cur_bind; + + ret = elf_sym_iter_new(&iter, elf, binary_path, sh_types[i], STT_FUNC); + if (ret == -ENOENT) + continue; + if (ret) + goto out; + + while ((sym = elf_sym_iter_next(&iter))) { + if (!symbol_match(&iter, sh_types[i], sym, name, name_len, lib_ver)) + continue; + + cur_bind = GELF_ST_BIND(sym->sym.st_info); + + if (ret > 0) { + /* handle multiple matches */ + if (elf_sym_offset(sym) == ret) { + /* same offset, no problem */ + continue; + } else if (last_bind != STB_WEAK && cur_bind != STB_WEAK) { + /* Only accept one non-weak bind. */ + pr_warn("elf: ambiguous match for '%s', '%s' in '%s'\n", + sym->name, name, binary_path); + ret = -LIBBPF_ERRNO__FORMAT; + goto out; + } else if (cur_bind == STB_WEAK) { + /* already have a non-weak bind, and + * this is a weak bind, so ignore. + */ + continue; + } + } + + ret = elf_sym_offset(sym); + last_bind = cur_bind; + } + if (ret > 0) + break; + } + + if (ret > 0) { + pr_debug("elf: symbol address match for '%s' in '%s': 0x%lx\n", name, binary_path, + ret); + } else { + if (ret == 0) { + pr_warn("elf: '%s' is 0 in symtab for '%s': %s\n", name, binary_path, + is_shared_lib ? "should not be 0 in a shared library" : + "try using shared library path instead"); + ret = -ENOENT; + } else { + pr_warn("elf: failed to find symbol '%s' in '%s'\n", name, binary_path); + } + } +out: + return ret; +} + +/* Find offset of function name in ELF object specified by path. "name" matches + * symbol name or name@@LIB for library functions. + */ +long elf_find_func_offset_from_file(const char *binary_path, const char *name) +{ + struct elf_fd elf_fd; + long ret = -ENOENT; + + ret = elf_open(binary_path, &elf_fd); + if (ret) + return ret; + ret = elf_find_func_offset(elf_fd.elf, binary_path, name); + elf_close(&elf_fd); + return ret; +} + +struct symbol { + const char *name; + int bind; + int idx; +}; + +static int symbol_cmp(const void *a, const void *b) +{ + const struct symbol *sym_a = a; + const struct symbol *sym_b = b; + + return strcmp(sym_a->name, sym_b->name); +} + +/* + * Return offsets in @poffsets for symbols specified in @syms array argument. + * On success returns 0 and offsets are returned in allocated array with @cnt + * size, that needs to be released by the caller. + */ +int elf_resolve_syms_offsets(const char *binary_path, int cnt, + const char **syms, unsigned long **poffsets) +{ + int sh_types[2] = { SHT_DYNSYM, SHT_SYMTAB }; + int err = 0, i, cnt_done = 0; + unsigned long *offsets; + struct symbol *symbols; + struct elf_fd elf_fd; + + err = elf_open(binary_path, &elf_fd); + if (err) + return err; + + offsets = calloc(cnt, sizeof(*offsets)); + symbols = calloc(cnt, sizeof(*symbols)); + + if (!offsets || !symbols) { + err = -ENOMEM; + goto out; + } + + for (i = 0; i < cnt; i++) { + symbols[i].name = syms[i]; + symbols[i].idx = i; + } + + qsort(symbols, cnt, sizeof(*symbols), symbol_cmp); + + for (i = 0; i < ARRAY_SIZE(sh_types); i++) { + struct elf_sym_iter iter; + struct elf_sym *sym; + + err = elf_sym_iter_new(&iter, elf_fd.elf, binary_path, sh_types[i], STT_FUNC); + if (err == -ENOENT) + continue; + if (err) + goto out; + + while ((sym = elf_sym_iter_next(&iter))) { + unsigned long sym_offset = elf_sym_offset(sym); + int bind = GELF_ST_BIND(sym->sym.st_info); + struct symbol *found, tmp = { + .name = sym->name, + }; + unsigned long *offset; + + found = bsearch(&tmp, symbols, cnt, sizeof(*symbols), symbol_cmp); + if (!found) + continue; + + offset = &offsets[found->idx]; + if (*offset > 0) { + /* same offset, no problem */ + if (*offset == sym_offset) + continue; + /* handle multiple matches */ + if (found->bind != STB_WEAK && bind != STB_WEAK) { + /* Only accept one non-weak bind. */ + pr_warn("elf: ambiguous match found '%s@%lu' in '%s' previous offset %lu\n", + sym->name, sym_offset, binary_path, *offset); + err = -ESRCH; + goto out; + } else if (bind == STB_WEAK) { + /* already have a non-weak bind, and + * this is a weak bind, so ignore. + */ + continue; + } + } else { + cnt_done++; + } + *offset = sym_offset; + found->bind = bind; + } + } + + if (cnt != cnt_done) { + err = -ENOENT; + goto out; + } + + *poffsets = offsets; + +out: + free(symbols); + if (err) + free(offsets); + elf_close(&elf_fd); + return err; +} + +/* + * Return offsets in @poffsets for symbols specified by @pattern argument. + * On success returns 0 and offsets are returned in allocated @poffsets + * array with the @pctn size, that needs to be released by the caller. + */ +int elf_resolve_pattern_offsets(const char *binary_path, const char *pattern, + unsigned long **poffsets, size_t *pcnt) +{ + int sh_types[2] = { SHT_SYMTAB, SHT_DYNSYM }; + unsigned long *offsets = NULL; + size_t cap = 0, cnt = 0; + struct elf_fd elf_fd; + int err = 0, i; + + err = elf_open(binary_path, &elf_fd); + if (err) + return err; + + for (i = 0; i < ARRAY_SIZE(sh_types); i++) { + struct elf_sym_iter iter; + struct elf_sym *sym; + + err = elf_sym_iter_new(&iter, elf_fd.elf, binary_path, sh_types[i], STT_FUNC); + if (err == -ENOENT) + continue; + if (err) + goto out; + + while ((sym = elf_sym_iter_next(&iter))) { + if (!glob_match(sym->name, pattern)) + continue; + + err = libbpf_ensure_mem((void **) &offsets, &cap, sizeof(*offsets), + cnt + 1); + if (err) + goto out; + + offsets[cnt++] = elf_sym_offset(sym); + } + + /* If we found anything in the first symbol section, + * do not search others to avoid duplicates. + */ + if (cnt) + break; + } + + if (cnt) { + *poffsets = offsets; + *pcnt = cnt; + } else { + err = -ENOENT; + } + +out: + if (err) + free(offsets); + elf_close(&elf_fd); + return err; +} diff --git a/src/gen_loader.c b/src/gen_loader.c index 23f5c46..cf3323f 100644 --- a/src/gen_loader.c +++ b/src/gen_loader.c @@ -560,7 +560,7 @@ static void emit_find_attach_target(struct bpf_gen *gen) } void bpf_gen__record_extern(struct bpf_gen *gen, const char *name, bool is_weak, - bool is_typeless, int kind, int insn_idx) + bool is_typeless, bool is_ld64, int kind, int insn_idx) { struct ksym_relo_desc *relo; @@ -574,6 +574,7 @@ void bpf_gen__record_extern(struct bpf_gen *gen, const char *name, bool is_weak, relo->name = name; relo->is_weak = is_weak; relo->is_typeless = is_typeless; + relo->is_ld64 = is_ld64; relo->kind = kind; relo->insn_idx = insn_idx; gen->relo_cnt++; @@ -586,9 +587,11 @@ static struct ksym_desc *get_ksym_desc(struct bpf_gen *gen, struct ksym_relo_des int i; for (i = 0; i < gen->nr_ksyms; i++) { - if (!strcmp(gen->ksyms[i].name, relo->name)) { - gen->ksyms[i].ref++; - return &gen->ksyms[i]; + kdesc = &gen->ksyms[i]; + if (kdesc->kind == relo->kind && kdesc->is_ld64 == relo->is_ld64 && + !strcmp(kdesc->name, relo->name)) { + kdesc->ref++; + return kdesc; } } kdesc = libbpf_reallocarray(gen->ksyms, gen->nr_ksyms + 1, sizeof(*kdesc)); @@ -603,6 +606,7 @@ static struct ksym_desc *get_ksym_desc(struct bpf_gen *gen, struct ksym_relo_des kdesc->ref = 1; kdesc->off = 0; kdesc->insn = 0; + kdesc->is_ld64 = relo->is_ld64; return kdesc; } @@ -699,17 +703,17 @@ static void emit_relo_kfunc_btf(struct bpf_gen *gen, struct ksym_relo_desc *relo /* obtain fd in BPF_REG_9 */ emit(gen, BPF_MOV64_REG(BPF_REG_9, BPF_REG_7)); emit(gen, BPF_ALU64_IMM(BPF_RSH, BPF_REG_9, 32)); - /* jump to fd_array store if fd denotes module BTF */ + /* load fd_array slot pointer */ + emit2(gen, BPF_LD_IMM64_RAW_FULL(BPF_REG_0, BPF_PSEUDO_MAP_IDX_VALUE, + 0, 0, 0, blob_fd_array_off(gen, btf_fd_idx))); + /* store BTF fd in slot, 0 for vmlinux */ + emit(gen, BPF_STX_MEM(BPF_W, BPF_REG_0, BPF_REG_9, 0)); + /* jump to insn[insn_idx].off store if fd denotes module BTF */ emit(gen, BPF_JMP_IMM(BPF_JNE, BPF_REG_9, 0, 2)); /* set the default value for off */ emit(gen, BPF_ST_MEM(BPF_H, BPF_REG_8, offsetof(struct bpf_insn, off), 0)); /* skip BTF fd store for vmlinux BTF */ - emit(gen, BPF_JMP_IMM(BPF_JA, 0, 0, 4)); - /* load fd_array slot pointer */ - emit2(gen, BPF_LD_IMM64_RAW_FULL(BPF_REG_0, BPF_PSEUDO_MAP_IDX_VALUE, - 0, 0, 0, blob_fd_array_off(gen, btf_fd_idx))); - /* store BTF fd in slot */ - emit(gen, BPF_STX_MEM(BPF_W, BPF_REG_0, BPF_REG_9, 0)); + emit(gen, BPF_JMP_IMM(BPF_JA, 0, 0, 1)); /* store index into insn[insn_idx].off */ emit(gen, BPF_ST_MEM(BPF_H, BPF_REG_8, offsetof(struct bpf_insn, off), btf_fd_idx)); log: @@ -804,11 +808,13 @@ static void emit_relo_ksym_btf(struct bpf_gen *gen, struct ksym_relo_desc *relo, return; /* try to copy from existing ldimm64 insn */ if (kdesc->ref > 1) { - move_blob2blob(gen, insn + offsetof(struct bpf_insn, imm), 4, - kdesc->insn + offsetof(struct bpf_insn, imm)); move_blob2blob(gen, insn + sizeof(struct bpf_insn) + offsetof(struct bpf_insn, imm), 4, kdesc->insn + sizeof(struct bpf_insn) + offsetof(struct bpf_insn, imm)); - /* jump over src_reg adjustment if imm is not 0, reuse BPF_REG_0 from move_blob2blob */ + move_blob2blob(gen, insn + offsetof(struct bpf_insn, imm), 4, + kdesc->insn + offsetof(struct bpf_insn, imm)); + /* jump over src_reg adjustment if imm (btf_id) is not 0, reuse BPF_REG_0 from move_blob2blob + * If btf_id is zero, clear BPF_PSEUDO_BTF_ID flag in src_reg of ld_imm64 insn + */ emit(gen, BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 3)); goto clear_src_reg; } @@ -831,7 +837,7 @@ static void emit_relo_ksym_btf(struct bpf_gen *gen, struct ksym_relo_desc *relo, emit(gen, BPF_STX_MEM(BPF_W, BPF_REG_8, BPF_REG_7, sizeof(struct bpf_insn) + offsetof(struct bpf_insn, imm))); /* skip src_reg adjustment */ - emit(gen, BPF_JMP_IMM(BPF_JSGE, BPF_REG_7, 0, 3)); + emit(gen, BPF_JMP_IMM(BPF_JA, 0, 0, 3)); clear_src_reg: /* clear bpf_object__relocate_data's src_reg assignment, otherwise we get a verifier failure */ reg_mask = src_reg_mask(); @@ -862,23 +868,17 @@ static void emit_relo(struct bpf_gen *gen, struct ksym_relo_desc *relo, int insn { int insn; - pr_debug("gen: emit_relo (%d): %s at %d\n", relo->kind, relo->name, relo->insn_idx); + pr_debug("gen: emit_relo (%d): %s at %d %s\n", + relo->kind, relo->name, relo->insn_idx, relo->is_ld64 ? "ld64" : "call"); insn = insns + sizeof(struct bpf_insn) * relo->insn_idx; emit2(gen, BPF_LD_IMM64_RAW_FULL(BPF_REG_8, BPF_PSEUDO_MAP_IDX_VALUE, 0, 0, 0, insn)); - switch (relo->kind) { - case BTF_KIND_VAR: + if (relo->is_ld64) { if (relo->is_typeless) emit_relo_ksym_typeless(gen, relo, insn); else emit_relo_ksym_btf(gen, relo, insn); - break; - case BTF_KIND_FUNC: + } else { emit_relo_kfunc_btf(gen, relo, insn); - break; - default: - pr_warn("Unknown relocation kind '%d'\n", relo->kind); - gen->error = -EDOM; - return; } } @@ -901,18 +901,20 @@ static void cleanup_core_relo(struct bpf_gen *gen) static void cleanup_relos(struct bpf_gen *gen, int insns) { + struct ksym_desc *kdesc; int i, insn; for (i = 0; i < gen->nr_ksyms; i++) { + kdesc = &gen->ksyms[i]; /* only close fds for typed ksyms and kfuncs */ - if (gen->ksyms[i].kind == BTF_KIND_VAR && !gen->ksyms[i].typeless) { + if (kdesc->is_ld64 && !kdesc->typeless) { /* close fd recorded in insn[insn_idx + 1].imm */ - insn = gen->ksyms[i].insn; + insn = kdesc->insn; insn += sizeof(struct bpf_insn) + offsetof(struct bpf_insn, imm); emit_sys_close_blob(gen, insn); - } else if (gen->ksyms[i].kind == BTF_KIND_FUNC) { - emit_sys_close_blob(gen, blob_fd_array_off(gen, gen->ksyms[i].off)); - if (gen->ksyms[i].off < MAX_FD_ARRAY_SZ) + } else if (!kdesc->is_ld64) { + emit_sys_close_blob(gen, blob_fd_array_off(gen, kdesc->off)); + if (kdesc->off < MAX_FD_ARRAY_SZ) gen->nr_fd_array--; } } diff --git a/src/hashmap.h b/src/hashmap.h index 0a5bf19..c12f832 100644 --- a/src/hashmap.h +++ b/src/hashmap.h @@ -80,16 +80,6 @@ struct hashmap { size_t sz; }; -#define HASHMAP_INIT(hash_fn, equal_fn, ctx) { \ - .hash_fn = (hash_fn), \ - .equal_fn = (equal_fn), \ - .ctx = (ctx), \ - .buckets = NULL, \ - .cap = 0, \ - .cap_bits = 0, \ - .sz = 0, \ -} - void hashmap__init(struct hashmap *map, hashmap_hash_fn hash_fn, hashmap_equal_fn equal_fn, void *ctx); struct hashmap *hashmap__new(hashmap_hash_fn hash_fn, diff --git a/src/libbpf.c b/src/libbpf.c index b2fb4bf..af7203f 100644 --- a/src/libbpf.c +++ b/src/libbpf.c @@ -57,28 +57,7 @@ #include "libbpf_internal.h" #include "hashmap.h" #include "bpf_gen_internal.h" - -#ifdef HAVE_ELFIO -#include "elfio_c_wrapper.h" -#include -#include -#include - -typedef struct Elf64_Ehdr Elf64_Ehdr; -typedef struct Elf64_Shdr Elf64_Shdr; -typedef struct Elf64_Sym Elf64_Sym; -typedef struct Elf64_Rel Elf64_Rel; -typedef struct { - void *d_buf; - size_t d_size; -} Elf_Data; - -#define ELF64_ST_TYPE(val) ELF_ST_TYPE (val) -#define ELF64_ST_BIND(val) ELF_ST_BIND (val) -#define elf_errmsg(val) "error" - -#endif - +#include "zip.h" #ifndef BPF_FS_MAGIC #define BPF_FS_MAGIC 0xcafe4a11 @@ -107,17 +86,22 @@ static const char * const attach_type_name[] = { [BPF_CGROUP_INET6_BIND] = "cgroup_inet6_bind", [BPF_CGROUP_INET4_CONNECT] = "cgroup_inet4_connect", [BPF_CGROUP_INET6_CONNECT] = "cgroup_inet6_connect", + [BPF_CGROUP_UNIX_CONNECT] = "cgroup_unix_connect", [BPF_CGROUP_INET4_POST_BIND] = "cgroup_inet4_post_bind", [BPF_CGROUP_INET6_POST_BIND] = "cgroup_inet6_post_bind", [BPF_CGROUP_INET4_GETPEERNAME] = "cgroup_inet4_getpeername", [BPF_CGROUP_INET6_GETPEERNAME] = "cgroup_inet6_getpeername", + [BPF_CGROUP_UNIX_GETPEERNAME] = "cgroup_unix_getpeername", [BPF_CGROUP_INET4_GETSOCKNAME] = "cgroup_inet4_getsockname", [BPF_CGROUP_INET6_GETSOCKNAME] = "cgroup_inet6_getsockname", + [BPF_CGROUP_UNIX_GETSOCKNAME] = "cgroup_unix_getsockname", [BPF_CGROUP_UDP4_SENDMSG] = "cgroup_udp4_sendmsg", [BPF_CGROUP_UDP6_SENDMSG] = "cgroup_udp6_sendmsg", + [BPF_CGROUP_UNIX_SENDMSG] = "cgroup_unix_sendmsg", [BPF_CGROUP_SYSCTL] = "cgroup_sysctl", [BPF_CGROUP_UDP4_RECVMSG] = "cgroup_udp4_recvmsg", [BPF_CGROUP_UDP6_RECVMSG] = "cgroup_udp6_recvmsg", + [BPF_CGROUP_UNIX_RECVMSG] = "cgroup_unix_recvmsg", [BPF_CGROUP_GETSOCKOPT] = "cgroup_getsockopt", [BPF_CGROUP_SETSOCKOPT] = "cgroup_setsockopt", [BPF_SK_SKB_STREAM_PARSER] = "sk_skb_stream_parser", @@ -141,6 +125,13 @@ static const char * const attach_type_name[] = { [BPF_SK_REUSEPORT_SELECT_OR_MIGRATE] = "sk_reuseport_select_or_migrate", [BPF_PERF_EVENT] = "perf_event", [BPF_TRACE_KPROBE_MULTI] = "trace_kprobe_multi", + [BPF_STRUCT_OPS] = "struct_ops", + [BPF_NETFILTER] = "netfilter", + [BPF_TCX_INGRESS] = "tcx_ingress", + [BPF_TCX_EGRESS] = "tcx_egress", + [BPF_TRACE_UPROBE_MULTI] = "trace_uprobe_multi", + [BPF_NETKIT_PRIMARY] = "netkit_primary", + [BPF_NETKIT_PEER] = "netkit_peer", }; static const char * const link_type_name[] = { @@ -154,6 +145,10 @@ static const char * const link_type_name[] = { [BPF_LINK_TYPE_PERF_EVENT] = "perf_event", [BPF_LINK_TYPE_KPROBE_MULTI] = "kprobe_multi", [BPF_LINK_TYPE_STRUCT_OPS] = "struct_ops", + [BPF_LINK_TYPE_NETFILTER] = "netfilter", + [BPF_LINK_TYPE_TCX] = "tcx", + [BPF_LINK_TYPE_UPROBE_MULTI] = "uprobe_multi", + [BPF_LINK_TYPE_NETKIT] = "netkit", }; static const char * const map_type_name[] = { @@ -225,6 +220,7 @@ static const char * const prog_type_name[] = { [BPF_PROG_TYPE_LSM] = "lsm", [BPF_PROG_TYPE_SK_LOOKUP] = "sk_lookup", [BPF_PROG_TYPE_SYSCALL] = "syscall", + [BPF_PROG_TYPE_NETFILTER] = "netfilter", }; static int __base_pr(enum libbpf_print_level level, const char *format, @@ -240,9 +236,10 @@ static libbpf_print_fn_t __libbpf_pr = __base_pr; libbpf_print_fn_t libbpf_set_print(libbpf_print_fn_t fn) { - libbpf_print_fn_t old_print_fn = __libbpf_pr; + libbpf_print_fn_t old_print_fn; + + old_print_fn = __atomic_exchange_n(&__libbpf_pr, fn, __ATOMIC_RELAXED); - __libbpf_pr = fn; return old_print_fn; } @@ -251,8 +248,10 @@ void libbpf_print(enum libbpf_print_level level, const char *format, ...) { va_list args; int old_errno; + libbpf_print_fn_t print_fn; - if (!__libbpf_pr) + print_fn = __atomic_load_n(&__libbpf_pr, __ATOMIC_RELAXED); + if (!print_fn) return; old_errno = errno; @@ -340,8 +339,8 @@ enum reloc_type { RELO_LD64, RELO_CALL, RELO_DATA, - RELO_EXTERN_VAR, - RELO_EXTERN_FUNC, + RELO_EXTERN_LD64, + RELO_EXTERN_CALL, RELO_SUBPROG_ADDR, RELO_CORE, }; @@ -354,6 +353,7 @@ struct reloc_desc { struct { int map_idx; int sym_off; + int ext_idx; }; }; }; @@ -379,6 +379,8 @@ enum sec_def_flags { SEC_SLEEPABLE = 8, /* BPF program support non-linear XDP buffer */ SEC_XDP_FRAGS = 16, + /* Setup proper attach type for usdt probes. */ + SEC_USDT = 32, }; struct bpf_sec_def { @@ -446,9 +448,11 @@ struct bpf_program { int fd; bool autoload; bool autoattach; + bool sym_global; bool mark_btf_static; enum bpf_prog_type type; enum bpf_attach_type expected_attach_type; + int exception_cb_idx; int prog_ifindex; __u32 attach_btf_obj_fd; @@ -492,6 +496,7 @@ struct bpf_struct_ops { #define KCONFIG_SEC ".kconfig" #define KSYMS_SEC ".ksyms" #define STRUCT_OPS_SEC ".struct_ops" +#define STRUCT_OPS_LINK_SEC ".struct_ops.link" enum libbpf_map_type { LIBBPF_MAP_UNSPEC, @@ -563,6 +568,7 @@ struct extern_desc { int btf_id; int sec_btf_id; const char *name; + char *essent_name; bool is_set; bool is_weak; union { @@ -635,6 +641,7 @@ struct elf_state { Elf64_Ehdr *ehdr; Elf_Data *symbols; Elf_Data *st_ops_data; + Elf_Data *st_ops_link_data; size_t shstrndx; /* section index for section name strings */ size_t strtabidx; struct elf_sec_desc *secs; @@ -644,6 +651,7 @@ struct elf_state { int text_shndx; int symbols_shndx; int st_ops_shndx; + int st_ops_link_shndx; }; struct usdt_manager; @@ -800,6 +808,7 @@ bpf_object__init_prog(struct bpf_object *obj, struct bpf_program *prog, prog->type = BPF_PROG_TYPE_UNSPEC; prog->fd = -1; + prog->exception_cb_idx = -1; /* libbpf's convention for SEC("?abc...") is that it's just like * SEC("abc...") but the corresponding bpf_program starts out with @@ -853,7 +862,6 @@ bpf_object__add_programs(struct bpf_object *obj, Elf_Data *sec_data, progs = obj->programs; nr_progs = obj->nr_programs; nr_syms = symbols->d_size / sizeof(Elf64_Sym); - sec_off = 0; for (i = 0; i < nr_syms; i++) { sym = elf_sym_by_idx(obj, i); @@ -907,14 +915,16 @@ bpf_object__add_programs(struct bpf_object *obj, Elf_Data *sec_data, if (err) return err; + if (ELF64_ST_BIND(sym->st_info) != STB_LOCAL) + prog->sym_global = true; + /* if function is a global/weak symbol, but has restricted * (STV_HIDDEN or STV_INTERNAL) visibility, mark its BTF FUNC * as static to enable more permissive BPF verification mode * with more outside context available to BPF verifier */ - if (ELF64_ST_BIND(sym->st_info) != STB_LOCAL - && (ELF64_ST_VISIBILITY(sym->st_other) == STV_HIDDEN - || ELF64_ST_VISIBILITY(sym->st_other) == STV_INTERNAL)) + if (prog->sym_global && (ELF64_ST_VISIBILITY(sym->st_other) == STV_HIDDEN + || ELF64_ST_VISIBILITY(sym->st_other) == STV_INTERNAL)) prog->mark_btf_static = true; nr_progs++; @@ -924,42 +934,6 @@ bpf_object__add_programs(struct bpf_object *obj, Elf_Data *sec_data, return 0; } -__u32 get_kernel_version(void) -{ - /* On Ubuntu LINUX_VERSION_CODE doesn't correspond to info.release, - * but Ubuntu provides /proc/version_signature file, as described at - * https://ubuntu.com/kernel, with an example contents below, which we - * can use to get a proper LINUX_VERSION_CODE. - * - * Ubuntu 5.4.0-12.15-generic 5.4.8 - * - * In the above, 5.4.8 is what kernel is actually expecting, while - * uname() call will return 5.4.0 in info.release. - */ - const char *ubuntu_kver_file = "/proc/version_signature"; - __u32 major, minor, patch; - struct utsname info; - - if (faccessat(AT_FDCWD, ubuntu_kver_file, R_OK, AT_EACCESS) == 0) { - FILE *f; - - f = fopen(ubuntu_kver_file, "r"); - if (f) { - if (fscanf(f, "%*s %*s %d.%d.%d\n", &major, &minor, &patch) == 3) { - fclose(f); - return KERNEL_VERSION(major, minor, patch); - } - fclose(f); - } - /* something went wrong, fall back to uname() approach */ - } - - uname(&info); - if (sscanf(info.release, "%u.%u.%u", &major, &minor, &patch) != 3) - return 0; - return KERNEL_VERSION(major, minor, patch); -} - static const struct btf_member * find_member_by_offset(const struct btf_type *t, __u32 bit_offset) { @@ -1209,7 +1183,8 @@ static int bpf_object__init_kern_struct_ops_maps(struct bpf_object *obj) return 0; } -static int bpf_object__init_struct_ops_maps(struct bpf_object *obj) +static int init_struct_ops_maps(struct bpf_object *obj, const char *sec_name, + int shndx, Elf_Data *data, __u32 map_flags) { const struct btf_type *type, *datasec; const struct btf_var_secinfo *vsi; @@ -1220,15 +1195,15 @@ static int bpf_object__init_struct_ops_maps(struct bpf_object *obj) struct bpf_map *map; __u32 i; - if (obj->efile.st_ops_shndx == -1) + if (shndx == -1) return 0; btf = obj->btf; - datasec_id = btf__find_by_name_kind(btf, STRUCT_OPS_SEC, + datasec_id = btf__find_by_name_kind(btf, sec_name, BTF_KIND_DATASEC); if (datasec_id < 0) { pr_warn("struct_ops init: DATASEC %s not found\n", - STRUCT_OPS_SEC); + sec_name); return -EINVAL; } @@ -1241,7 +1216,7 @@ static int bpf_object__init_struct_ops_maps(struct bpf_object *obj) type_id = btf__resolve_type(obj->btf, vsi->type); if (type_id < 0) { pr_warn("struct_ops init: Cannot resolve var type_id %u in DATASEC %s\n", - vsi->type, STRUCT_OPS_SEC); + vsi->type, sec_name); return -EINVAL; } @@ -1260,7 +1235,7 @@ static int bpf_object__init_struct_ops_maps(struct bpf_object *obj) if (IS_ERR(map)) return PTR_ERR(map); - map->sec_idx = obj->efile.st_ops_shndx; + map->sec_idx = shndx; map->sec_offset = vsi->offset; map->name = strdup(var_name); if (!map->name) @@ -1270,6 +1245,7 @@ static int bpf_object__init_struct_ops_maps(struct bpf_object *obj) map->def.key_size = sizeof(int); map->def.value_size = type->size; map->def.max_entries = 1; + map->def.map_flags = map_flags; map->st_ops = calloc(1, sizeof(*map->st_ops)); if (!map->st_ops) @@ -1282,14 +1258,14 @@ static int bpf_object__init_struct_ops_maps(struct bpf_object *obj) if (!st_ops->data || !st_ops->progs || !st_ops->kern_func_off) return -ENOMEM; - if (vsi->offset + type->size > obj->efile.st_ops_data->d_size) { + if (vsi->offset + type->size > data->d_size) { pr_warn("struct_ops init: var %s is beyond the end of DATASEC %s\n", - var_name, STRUCT_OPS_SEC); + var_name, sec_name); return -EINVAL; } memcpy(st_ops->data, - obj->efile.st_ops_data->d_buf + vsi->offset, + data->d_buf + vsi->offset, type->size); st_ops->tname = tname; st_ops->type = type; @@ -1302,6 +1278,19 @@ static int bpf_object__init_struct_ops_maps(struct bpf_object *obj) return 0; } +static int bpf_object_init_struct_ops(struct bpf_object *obj) +{ + int err; + + err = init_struct_ops_maps(obj, STRUCT_OPS_SEC, obj->efile.st_ops_shndx, + obj->efile.st_ops_data, 0); + err = err ?: init_struct_ops_maps(obj, STRUCT_OPS_LINK_SEC, + obj->efile.st_ops_link_shndx, + obj->efile.st_ops_link_data, + BPF_F_LINK); + return err; +} + static struct bpf_object *bpf_object__new(const char *path, const void *obj_buf, size_t obj_buf_sz, @@ -1338,6 +1327,7 @@ static struct bpf_object *bpf_object__new(const char *path, obj->efile.obj_buf_sz = obj_buf_sz; obj->efile.btf_maps_shndx = -1; obj->efile.st_ops_shndx = -1; + obj->efile.st_ops_link_shndx = -1; obj->kconfig_map_idx = -1; obj->kern_version = get_kernel_version(); @@ -1364,6 +1354,7 @@ static void bpf_object__elf_finish(struct bpf_object *obj) obj->efile.elf = NULL; obj->efile.symbols = NULL; obj->efile.st_ops_data = NULL; + obj->efile.st_ops_link_data = NULL; zfree(&obj->efile.secs); obj->efile.sec_cnt = 0; @@ -1604,16 +1595,36 @@ static struct bpf_map *bpf_object__add_map(struct bpf_object *obj) return map; } -static size_t bpf_map_mmap_sz(const struct bpf_map *map) +static size_t bpf_map_mmap_sz(unsigned int value_sz, unsigned int max_entries) { - long page_sz = sysconf(_SC_PAGE_SIZE); + const long page_sz = sysconf(_SC_PAGE_SIZE); size_t map_sz; - map_sz = (size_t)roundup(map->def.value_size, 8) * map->def.max_entries; + map_sz = (size_t)roundup(value_sz, 8) * max_entries; map_sz = roundup(map_sz, page_sz); return map_sz; } +static int bpf_map_mmap_resize(struct bpf_map *map, size_t old_sz, size_t new_sz) +{ + void *mmaped; + + if (!map->mmaped) + return -EINVAL; + + if (old_sz == new_sz) + return 0; + + mmaped = mmap(NULL, new_sz, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, -1, 0); + if (mmaped == MAP_FAILED) + return -errno; + + memcpy(mmaped, map->mmaped, min(old_sz, new_sz)); + munmap(map->mmaped, old_sz); + map->mmaped = mmaped; + return 0; +} + static char *internal_map_name(struct bpf_object *obj, const char *real_name) { char map_name[BPF_OBJ_NAME_LEN], *p; @@ -1712,6 +1723,7 @@ bpf_object__init_internal_map(struct bpf_object *obj, enum libbpf_map_type type, { struct bpf_map_def *def; struct bpf_map *map; + size_t mmap_sz; int err; map = bpf_object__add_map(obj); @@ -1746,7 +1758,8 @@ bpf_object__init_internal_map(struct bpf_object *obj, enum libbpf_map_type type, pr_debug("map '%s' (global data): at sec_idx %d, offset %zu, flags %x.\n", map->name, map->sec_idx, map->sec_offset, def->map_flags); - map->mmaped = mmap(NULL, bpf_map_mmap_sz(map), PROT_READ | PROT_WRITE, + mmap_sz = bpf_map_mmap_sz(map->def.value_size, map->def.max_entries); + map->mmaped = mmap(NULL, mmap_sz, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, -1, 0); if (map->mmaped == MAP_FAILED) { err = -errno; @@ -2068,9 +2081,9 @@ static int bpf_object__read_kconfig_file(struct bpf_object *obj, void *data) return -ENAMETOOLONG; /* gzopen also accepts uncompressed files. */ - file = gzopen(buf, "r"); + file = gzopen(buf, "re"); if (!file) - file = gzopen("/proc/config.gz", "r"); + file = gzopen("/proc/config.gz", "re"); if (!file) { pr_warn("failed to open system Kconfig\n"); @@ -2765,10 +2778,10 @@ static int bpf_object__init_maps(struct bpf_object *obj, strict = !OPTS_GET(opts, relaxed_maps, false); pin_root_path = OPTS_GET(opts, pin_root_path, NULL); - err = err ?: bpf_object__init_user_btf_maps(obj, strict, pin_root_path); + err = bpf_object__init_user_btf_maps(obj, strict, pin_root_path); err = err ?: bpf_object__init_global_data_maps(obj); err = err ?: bpf_object__init_kconfig_map(obj); - err = err ?: bpf_object__init_struct_ops_maps(obj); + err = err ?: bpf_object_init_struct_ops(obj); return err; } @@ -2906,12 +2919,13 @@ static bool libbpf_needs_btf(const struct bpf_object *obj) { return obj->efile.btf_maps_shndx >= 0 || obj->efile.st_ops_shndx >= 0 || + obj->efile.st_ops_link_shndx >= 0 || obj->nr_extern > 0; } static bool kernel_needs_btf(const struct bpf_object *obj) { - return obj->efile.st_ops_shndx >= 0; + return obj->efile.st_ops_shndx >= 0 || obj->efile.st_ops_link_shndx >= 0; } static int bpf_object__init_btf(struct bpf_object *obj, @@ -3251,6 +3265,86 @@ static int bpf_object__sanitize_and_load_btf(struct bpf_object *obj) } } + if (!kernel_supports(obj, FEAT_BTF_DECL_TAG)) + goto skip_exception_cb; + for (i = 0; i < obj->nr_programs; i++) { + struct bpf_program *prog = &obj->programs[i]; + int j, k, n; + + if (prog_is_subprog(obj, prog)) + continue; + n = btf__type_cnt(obj->btf); + for (j = 1; j < n; j++) { + const char *str = "exception_callback:", *name; + size_t len = strlen(str); + struct btf_type *t; + + t = btf_type_by_id(obj->btf, j); + if (!btf_is_decl_tag(t) || btf_decl_tag(t)->component_idx != -1) + continue; + + name = btf__str_by_offset(obj->btf, t->name_off); + if (strncmp(name, str, len)) + continue; + + t = btf_type_by_id(obj->btf, t->type); + if (!btf_is_func(t) || btf_func_linkage(t) != BTF_FUNC_GLOBAL) { + pr_warn("prog '%s': exception_callback: decl tag not applied to the main program\n", + prog->name); + return -EINVAL; + } + if (strcmp(prog->name, btf__str_by_offset(obj->btf, t->name_off))) + continue; + /* Multiple callbacks are specified for the same prog, + * the verifier will eventually return an error for this + * case, hence simply skip appending a subprog. + */ + if (prog->exception_cb_idx >= 0) { + prog->exception_cb_idx = -1; + break; + } + + name += len; + if (str_is_empty(name)) { + pr_warn("prog '%s': exception_callback: decl tag contains empty value\n", + prog->name); + return -EINVAL; + } + + for (k = 0; k < obj->nr_programs; k++) { + struct bpf_program *subprog = &obj->programs[k]; + + if (!prog_is_subprog(obj, subprog)) + continue; + if (strcmp(name, subprog->name)) + continue; + /* Enforce non-hidden, as from verifier point of + * view it expects global functions, whereas the + * mark_btf_static fixes up linkage as static. + */ + if (!subprog->sym_global || subprog->mark_btf_static) { + pr_warn("prog '%s': exception callback %s must be a global non-hidden function\n", + prog->name, subprog->name); + return -EINVAL; + } + /* Let's see if we already saw a static exception callback with the same name */ + if (prog->exception_cb_idx >= 0) { + pr_warn("prog '%s': multiple subprogs with same name as exception callback '%s'\n", + prog->name, subprog->name); + return -EINVAL; + } + prog->exception_cb_idx = k; + break; + } + + if (prog->exception_cb_idx >= 0) + continue; + pr_warn("prog '%s': cannot find exception callback '%s'\n", prog->name, name); + return -ENOENT; + } + } +skip_exception_cb: + sanitize = btf_needs_sanitization(obj); if (sanitize) { const void *raw_data; @@ -4057,6 +4151,7 @@ static int bpf_object__collect_externs(struct bpf_object *obj) struct extern_desc *ext; int i, n, off, dummy_var_btf_id; const char *ext_name, *sec_name; + size_t ext_essent_len; #ifdef HAVE_LIBELF Elf_Scn *scn; #endif @@ -4115,6 +4210,14 @@ static int bpf_object__collect_externs(struct bpf_object *obj) ext->sym_idx = i; ext->is_weak = ELF64_ST_BIND(sym->st_info) == STB_WEAK; + ext_essent_len = bpf_core_essential_name_len(ext->name); + ext->essent_name = NULL; + if (ext_essent_len != strlen(ext->name)) { + ext->essent_name = strndup(ext->name, ext_essent_len); + if (!ext->essent_name) + return -ENOMEM; + } + ext->sec_btf_id = find_extern_sec_btf_id(obj->btf, ext->btf_id); if (ext->sec_btf_id <= 0) { pr_warn("failed to find BTF for extern '%s' [%d] section: %d\n", @@ -4363,11 +4466,11 @@ static int bpf_program__record_reloc(struct bpf_program *prog, pr_debug("prog '%s': found extern #%d '%s' (sym %d) for insn #%u\n", prog->name, i, ext->name, ext->sym_idx, insn_idx); if (insn->code == (BPF_JMP | BPF_CALL)) - reloc_desc->type = RELO_EXTERN_FUNC; + reloc_desc->type = RELO_EXTERN_CALL; else - reloc_desc->type = RELO_EXTERN_VAR; + reloc_desc->type = RELO_EXTERN_LD64; reloc_desc->insn_idx = insn_idx; - reloc_desc->sym_off = i; /* sym_off stores extern index */ + reloc_desc->ext_idx = i; return 0; } @@ -4674,7 +4777,7 @@ static int bpf_get_map_info_from_fdinfo(int fd, struct bpf_map_info *info) snprintf(file, sizeof(file), "/proc/%d/fdinfo/%d", getpid(), fd); memset(info, 0, sizeof(*info)); - fp = fopen(file, "r"); + fp = fopen(file, "re"); if (!fp) { err = -errno; pr_warn("failed to open %s: %d. No procfs support?\n", file, @@ -4722,7 +4825,7 @@ int bpf_map__reuse_fd(struct bpf_map *map, int fd) char *new_name; memset(&info, 0, len); - err = bpf_obj_get_info_by_fd(fd, &info, &len); + err = bpf_map_get_info_by_fd(fd, &info, &len); if (err && errno == EINVAL) err = bpf_get_map_info_from_fdinfo(fd, &info); if (err) @@ -4737,18 +4840,17 @@ int bpf_map__reuse_fd(struct bpf_map *map, int fd) if (!new_name) return libbpf_err(-errno); - new_fd = open("/", O_RDONLY | O_CLOEXEC); + /* + * Like dup(), but make sure new FD is >= 3 and has O_CLOEXEC set. + * This is similar to what we do in ensure_good_fd(), but without + * closing original FD. + */ + new_fd = fcntl(fd, F_DUPFD_CLOEXEC, 3); if (new_fd < 0) { err = -errno; goto err_free_new_name; } - new_fd = dup3(fd, new_fd, O_CLOEXEC); - if (new_fd < 0) { - err = -errno; - goto err_close_new_fd; - } - err = zclose(map->fd); if (err) { err = -errno; @@ -5106,7 +5208,7 @@ static int probe_module_btf(void) * kernel's module BTF support coincides with support for * name/name_len fields in struct bpf_btf_info. */ - err = bpf_obj_get_info_by_fd(fd, &info, &len); + err = bpf_btf_get_info_by_fd(fd, &info, &len); close(fd); return !err; } @@ -5137,6 +5239,39 @@ static int probe_perf_link(void) return link_fd < 0 && err == -EBADF; } +static int probe_uprobe_multi_link(void) +{ + LIBBPF_OPTS(bpf_prog_load_opts, load_opts, + .expected_attach_type = BPF_TRACE_UPROBE_MULTI, + ); + LIBBPF_OPTS(bpf_link_create_opts, link_opts); + struct bpf_insn insns[] = { + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }; + int prog_fd, link_fd, err; + unsigned long offset = 0; + + prog_fd = bpf_prog_load(BPF_PROG_TYPE_KPROBE, NULL, "GPL", + insns, ARRAY_SIZE(insns), &load_opts); + if (prog_fd < 0) + return -errno; + + /* Creating uprobe in '/' binary should fail with -EBADF. */ + link_opts.uprobe_multi.path = "/"; + link_opts.uprobe_multi.offsets = &offset; + link_opts.uprobe_multi.cnt = 1; + + link_fd = bpf_link_create(prog_fd, -1, BPF_TRACE_UPROBE_MULTI, &link_opts); + err = -errno; /* close() can clobber errno */ + + if (link_fd >= 0) + close(link_fd); + close(prog_fd); + + return link_fd < 0 && err == -EBADF; +} + static int probe_kern_bpf_cookie(void) { struct bpf_insn insns[] = { @@ -5233,6 +5368,9 @@ static struct kern_feature_desc { [FEAT_SYSCALL_WRAPPER] = { "Kernel using syscall wrapper", probe_kern_syscall_wrapper, }, + [FEAT_UPROBE_MULTI_LINK] = { + "BPF multi-uprobe link support", probe_uprobe_multi_link, + }, }; bool kernel_supports(const struct bpf_object *obj, enum kern_feature_id feat_id) @@ -5269,7 +5407,7 @@ static bool map_is_reuse_compat(const struct bpf_map *map, int map_fd) int err; memset(&map_info, 0, map_info_len); - err = bpf_obj_get_info_by_fd(map_fd, &map_info, &map_info_len); + err = bpf_map_get_info_by_fd(map_fd, &map_info, &map_info_len); if (err && errno == EINVAL) err = bpf_get_map_info_from_fdinfo(map_fd, &map_info); if (err) { @@ -5795,6 +5933,10 @@ static int load_module_btfs(struct bpf_object *obj) err = bpf_btf_get_next_id(id, &id); if (err && errno == ENOENT) return 0; + if (err && errno == EPERM) { + pr_debug("skipping module BTFs loading, missing privileges\n"); + return 0; + } if (err) { err = -errno; pr_warn("failed to iterate BTF objects: %d\n", err); @@ -5815,7 +5957,7 @@ static int load_module_btfs(struct bpf_object *obj) info.name = ptr_to_u64(name); info.name_len = sizeof(name); - err = bpf_obj_get_info_by_fd(fd, &info, &len); + err = bpf_btf_get_info_by_fd(fd, &info, &len); if (err) { err = -errno; pr_warn("failed to get BTF object #%d info: %d\n", id, err); @@ -6162,8 +6304,8 @@ out: } /* base map load ldimm64 special constant, used also for log fixup logic */ -#define MAP_LDIMM64_POISON_BASE 2001000000 -#define MAP_LDIMM64_POISON_PFX "200100" +#define POISON_LDIMM64_MAP_BASE 2001000000 +#define POISON_LDIMM64_MAP_PFX "200100" static void poison_map_ldimm64(struct bpf_program *prog, int relo_idx, int insn_idx, struct bpf_insn *insn, @@ -6185,12 +6327,36 @@ static void poison_map_ldimm64(struct bpf_program *prog, int relo_idx, * invalid func unknown#2001000123 * where lower 123 is map index into obj->maps[] array */ - insn->imm = MAP_LDIMM64_POISON_BASE + map_idx; + insn->imm = POISON_LDIMM64_MAP_BASE + map_idx; insn++; } } +/* unresolved kfunc call special constant, used also for log fixup logic */ +#define POISON_CALL_KFUNC_BASE 2002000000 +#define POISON_CALL_KFUNC_PFX "2002" + +static void poison_kfunc_call(struct bpf_program *prog, int relo_idx, + int insn_idx, struct bpf_insn *insn, + int ext_idx, const struct extern_desc *ext) +{ + pr_debug("prog '%s': relo #%d: poisoning insn #%d that calls kfunc '%s'\n", + prog->name, relo_idx, insn_idx, ext->name); + + /* we turn kfunc call into invalid helper call with identifiable constant */ + insn->code = BPF_JMP | BPF_CALL; + insn->dst_reg = 0; + insn->src_reg = 0; + insn->off = 0; + /* if this instruction is reachable (not a dead code), + * verifier will complain with something like: + * invalid func unknown#2001000123 + * where lower 123 is extern index into obj->externs[] array + */ + insn->imm = POISON_CALL_KFUNC_BASE + ext_idx; +} + /* Relocate data references within program code: * - map references; * - global variable references; @@ -6235,8 +6401,8 @@ bpf_object__relocate_data(struct bpf_object *obj, struct bpf_program *prog) relo->map_idx, map); } break; - case RELO_EXTERN_VAR: - ext = &obj->externs[relo->sym_off]; + case RELO_EXTERN_LD64: + ext = &obj->externs[relo->ext_idx]; if (ext->type == EXT_KCFG) { if (obj->gen_loader) { insn[0].src_reg = BPF_PSEUDO_MAP_IDX_VALUE; @@ -6257,15 +6423,15 @@ bpf_object__relocate_data(struct bpf_object *obj, struct bpf_program *prog) } } break; - case RELO_EXTERN_FUNC: - ext = &obj->externs[relo->sym_off]; + case RELO_EXTERN_CALL: + ext = &obj->externs[relo->ext_idx]; insn[0].src_reg = BPF_PSEUDO_KFUNC_CALL; if (ext->is_set) { insn[0].imm = ext->ksym.kernel_btf_id; insn[0].off = ext->ksym.btf_fd_idx; - } else { /* unresolved weak kfunc */ - insn[0].imm = 0; - insn[0].off = 0; + } else { /* unresolved weak kfunc call */ + poison_kfunc_call(prog, i, relo->insn_idx, insn, + relo->ext_idx, ext); } break; case RELO_SUBPROG_ADDR: @@ -6459,7 +6625,11 @@ static int append_subprog_relos(struct bpf_program *main_prog, struct bpf_progra if (main_prog == subprog) return 0; relos = libbpf_reallocarray(main_prog->reloc_desc, new_cnt, sizeof(*relos)); - if (!relos) + /* if new count is zero, reallocarray can return a valid NULL result; + * in this case the previous pointer will be freed, so we *have to* + * reassign old pointer to the new value (even if it's NULL) + */ + if (!relos && new_cnt) return -ENOMEM; if (subprog->nr_reloc) memcpy(relos + main_prog->nr_reloc, subprog->reloc_desc, @@ -6475,14 +6645,46 @@ static int append_subprog_relos(struct bpf_program *main_prog, struct bpf_progra return 0; } +static int +bpf_object__append_subprog_code(struct bpf_object *obj, struct bpf_program *main_prog, + struct bpf_program *subprog) +{ + struct bpf_insn *insns; + size_t new_cnt; + int err; + + subprog->sub_insn_off = main_prog->insns_cnt; + + new_cnt = main_prog->insns_cnt + subprog->insns_cnt; + insns = libbpf_reallocarray(main_prog->insns, new_cnt, sizeof(*insns)); + if (!insns) { + pr_warn("prog '%s': failed to realloc prog code\n", main_prog->name); + return -ENOMEM; + } + main_prog->insns = insns; + main_prog->insns_cnt = new_cnt; + + memcpy(main_prog->insns + subprog->sub_insn_off, subprog->insns, + subprog->insns_cnt * sizeof(*insns)); + + pr_debug("prog '%s': added %zu insns from sub-prog '%s'\n", + main_prog->name, subprog->insns_cnt, subprog->name); + + /* The subprog insns are now appended. Append its relos too. */ + err = append_subprog_relos(main_prog, subprog); + if (err) + return err; + return 0; +} + static int bpf_object__reloc_code(struct bpf_object *obj, struct bpf_program *main_prog, struct bpf_program *prog) { - size_t sub_insn_idx, insn_idx, new_cnt; + size_t sub_insn_idx, insn_idx; struct bpf_program *subprog; - struct bpf_insn *insns, *insn; struct reloc_desc *relo; + struct bpf_insn *insn; int err; err = reloc_prog_func_and_line_info(obj, main_prog, prog); @@ -6495,7 +6697,7 @@ bpf_object__reloc_code(struct bpf_object *obj, struct bpf_program *main_prog, continue; relo = find_prog_insn_relo(prog, insn_idx); - if (relo && relo->type == RELO_EXTERN_FUNC) + if (relo && relo->type == RELO_EXTERN_CALL) /* kfunc relocations will be handled later * in bpf_object__relocate_data() */ @@ -6557,25 +6759,7 @@ bpf_object__reloc_code(struct bpf_object *obj, struct bpf_program *main_prog, * and relocate. */ if (subprog->sub_insn_off == 0) { - subprog->sub_insn_off = main_prog->insns_cnt; - - new_cnt = main_prog->insns_cnt + subprog->insns_cnt; - insns = libbpf_reallocarray(main_prog->insns, new_cnt, sizeof(*insns)); - if (!insns) { - pr_warn("prog '%s': failed to realloc prog code\n", main_prog->name); - return -ENOMEM; - } - main_prog->insns = insns; - main_prog->insns_cnt = new_cnt; - - memcpy(main_prog->insns + subprog->sub_insn_off, subprog->insns, - subprog->insns_cnt * sizeof(*insns)); - - pr_debug("prog '%s': added %zu insns from sub-prog '%s'\n", - main_prog->name, subprog->insns_cnt, subprog->name); - - /* The subprog insns are now appended. Append its relos too. */ - err = append_subprog_relos(main_prog, subprog); + err = bpf_object__append_subprog_code(obj, main_prog, subprog); if (err) return err; err = bpf_object__reloc_code(obj, main_prog, subprog); @@ -6809,6 +6993,25 @@ bpf_object__relocate(struct bpf_object *obj, const char *targ_btf_path) prog->name, err); return err; } + + /* Now, also append exception callback if it has not been done already. */ + if (prog->exception_cb_idx >= 0) { + struct bpf_program *subprog = &obj->programs[prog->exception_cb_idx]; + + /* Calling exception callback directly is disallowed, which the + * verifier will reject later. In case it was processed already, + * we can skip this step, otherwise for all other valid cases we + * have to append exception callback now. + */ + if (subprog->sub_insn_off == 0) { + err = bpf_object__append_subprog_code(obj, prog, subprog); + if (err) + return err; + err = bpf_object__reloc_code(obj, prog, subprog); + if (err) + return err; + } + } } /* Process data relos for main programs */ for (i = 0; i < obj->nr_programs; i++) { @@ -6995,7 +7198,7 @@ static int bpf_object__collect_relos(struct bpf_object *obj) return -LIBBPF_ERRNO__INTERNAL; } - if (idx == obj->efile.st_ops_shndx) + if (idx == obj->efile.st_ops_shndx || idx == obj->efile.st_ops_link_shndx) err = bpf_object__collect_st_ops_relos(obj, shdr, data); else if (idx == obj->efile.btf_maps_shndx) err = bpf_object__collect_map_relos(obj, shdr, data); @@ -7076,6 +7279,10 @@ static int libbpf_prepare_prog_load(struct bpf_program *prog, if (prog->type == BPF_PROG_TYPE_XDP && (def & SEC_XDP_FRAGS)) opts->prog_flags |= BPF_F_XDP_HAS_FRAGS; + /* special check for usdt to use uprobe_multi link */ + if ((def & SEC_USDT) && kernel_supports(prog->obj, FEAT_UPROBE_MULTI_LINK)) + prog->expected_attach_type = BPF_TRACE_UPROBE_MULTI; + if ((def & SEC_ATTACH_BTF) && !prog->attach_btf_id) { int btf_obj_fd = 0, btf_type_id = 0, err; const char *attach_name; @@ -7144,7 +7351,6 @@ static int bpf_object_load_prog(struct bpf_object *obj, struct bpf_program *prog if (!insns || !insns_cnt) return -EINVAL; - load_attr.expected_attach_type = prog->expected_attach_type; if (kernel_supports(obj, FEAT_PROG_NAME)) prog_name = prog->name; load_attr.attach_prog_fd = prog->attach_prog_fd; @@ -7180,6 +7386,9 @@ static int bpf_object_load_prog(struct bpf_object *obj, struct bpf_program *prog insns_cnt = prog->insns_cnt; } + /* allow prog_prepare_load_fn to change expected_attach_type */ + load_attr.expected_attach_type = prog->expected_attach_type; + if (obj->gen_loader) { bpf_gen__prog_load(obj->gen_loader, prog->type, prog->name, license, insns, insns_cnt, &load_attr, @@ -7378,13 +7587,13 @@ static void fixup_log_missing_map_load(struct bpf_program *prog, char *buf, size_t buf_sz, size_t log_sz, char *line1, char *line2, char *line3) { - /* Expected log for failed and not properly guarded CO-RE relocation: + /* Expected log for failed and not properly guarded map reference: * line1 -> 123: (85) call unknown#2001000345 * line2 -> invalid func unknown#2001000345 * line3 -> * * "123" is the index of the instruction that was poisoned. - * "345" in "2001000345" are map index in obj->maps to fetch map name. + * "345" in "2001000345" is a map index in obj->maps to fetch map name. */ struct bpf_object *obj = prog->obj; const struct bpf_map *map; @@ -7394,7 +7603,7 @@ static void fixup_log_missing_map_load(struct bpf_program *prog, if (sscanf(line1, "%d: (%*d) call unknown#%d\n", &insn_idx, &map_idx) != 2) return; - map_idx -= MAP_LDIMM64_POISON_BASE; + map_idx -= POISON_LDIMM64_MAP_BASE; if (map_idx < 0 || map_idx >= obj->nr_maps) return; map = &obj->maps[map_idx]; @@ -7407,6 +7616,39 @@ static void fixup_log_missing_map_load(struct bpf_program *prog, patch_log(buf, buf_sz, log_sz, line1, line3 - line1, patch); } +static void fixup_log_missing_kfunc_call(struct bpf_program *prog, + char *buf, size_t buf_sz, size_t log_sz, + char *line1, char *line2, char *line3) +{ + /* Expected log for failed and not properly guarded kfunc call: + * line1 -> 123: (85) call unknown#2002000345 + * line2 -> invalid func unknown#2002000345 + * line3 -> + * + * "123" is the index of the instruction that was poisoned. + * "345" in "2002000345" is an extern index in obj->externs to fetch kfunc name. + */ + struct bpf_object *obj = prog->obj; + const struct extern_desc *ext; + int insn_idx, ext_idx; + char patch[128]; + + if (sscanf(line1, "%d: (%*d) call unknown#%d\n", &insn_idx, &ext_idx) != 2) + return; + + ext_idx -= POISON_CALL_KFUNC_BASE; + if (ext_idx < 0 || ext_idx >= obj->nr_extern) + return; + ext = &obj->externs[ext_idx]; + + snprintf(patch, sizeof(patch), + "%d: \n" + "kfunc '%s' is referenced but wasn't resolved\n", + insn_idx, ext->name); + + patch_log(buf, buf_sz, log_sz, line1, line3 - line1, patch); +} + static void fixup_verifier_log(struct bpf_program *prog, char *buf, size_t buf_sz) { /* look for familiar error patterns in last N lines of the log */ @@ -7426,23 +7668,33 @@ static void fixup_verifier_log(struct bpf_program *prog, char *buf, size_t buf_s if (!cur_line) return; - /* failed CO-RE relocation case */ if (str_has_pfx(cur_line, "invalid func unknown#195896080\n")) { prev_line = find_prev_line(buf, cur_line); if (!prev_line) continue; + /* failed CO-RE relocation case */ fixup_log_failed_core_relo(prog, buf, buf_sz, log_sz, prev_line, cur_line, next_line); return; - } else if (str_has_pfx(cur_line, "invalid func unknown#"MAP_LDIMM64_POISON_PFX)) { + } else if (str_has_pfx(cur_line, "invalid func unknown#"POISON_LDIMM64_MAP_PFX)) { prev_line = find_prev_line(buf, cur_line); if (!prev_line) continue; + /* reference to uncreated BPF map */ fixup_log_missing_map_load(prog, buf, buf_sz, log_sz, prev_line, cur_line, next_line); return; + } else if (str_has_pfx(cur_line, "invalid func unknown#"POISON_CALL_KFUNC_PFX)) { + prev_line = find_prev_line(buf, cur_line); + if (!prev_line) + continue; + + /* reference to unresolved kfunc */ + fixup_log_missing_kfunc_call(prog, buf, buf_sz, log_sz, + prev_line, cur_line, next_line); + return; } } } @@ -7454,19 +7706,22 @@ static int bpf_program_record_relos(struct bpf_program *prog) for (i = 0; i < prog->nr_reloc; i++) { struct reloc_desc *relo = &prog->reloc_desc[i]; - struct extern_desc *ext = &obj->externs[relo->sym_off]; + struct extern_desc *ext = &obj->externs[relo->ext_idx]; + int kind; switch (relo->type) { - case RELO_EXTERN_VAR: + case RELO_EXTERN_LD64: if (ext->type != EXT_KSYM) continue; + kind = btf_is_var(btf__type_by_id(obj->btf, ext->btf_id)) ? + BTF_KIND_VAR : BTF_KIND_FUNC; bpf_gen__record_extern(obj->gen_loader, ext->name, ext->is_weak, !ext->ksym.type_id, - BTF_KIND_VAR, relo->insn_idx); + true, kind, relo->insn_idx); break; - case RELO_EXTERN_FUNC: + case RELO_EXTERN_CALL: bpf_gen__record_extern(obj->gen_loader, ext->name, - ext->is_weak, false, BTF_KIND_FUNC, + ext->is_weak, false, false, BTF_KIND_FUNC, relo->insn_idx); break; case RELO_CORE: { @@ -7705,7 +7960,7 @@ static int bpf_object__sanitize_maps(struct bpf_object *obj) if (!bpf_map__is_internal(m)) continue; if (!kernel_supports(obj, FEAT_ARRAY_MMAP)) - m->def.map_flags ^= BPF_F_MMAPABLE; + m->def.map_flags &= ~BPF_F_MMAPABLE; } return 0; @@ -7718,7 +7973,7 @@ int libbpf_kallsyms_parse(kallsyms_cb_t cb, void *ctx) int ret, err = 0; FILE *f; - f = fopen("/proc/kallsyms", "r"); + f = fopen("/proc/kallsyms", "re"); if (!f) { err = -errno; pr_warn("failed to open /proc/kallsyms: %d\n", err); @@ -7876,7 +8131,8 @@ static int bpf_object__resolve_ksym_func_btf_id(struct bpf_object *obj, local_func_proto_id = ext->ksym.type_id; - kfunc_id = find_ksym_btf_id(obj, ext->name, BTF_KIND_FUNC, &kern_btf, &mod_btf); + kfunc_id = find_ksym_btf_id(obj, ext->essent_name ?: ext->name, BTF_KIND_FUNC, &kern_btf, + &mod_btf); if (kfunc_id < 0) { if (kfunc_id == -ESRCH && ext->is_weak) return 0; @@ -7891,8 +8147,12 @@ static int bpf_object__resolve_ksym_func_btf_id(struct bpf_object *obj, ret = bpf_core_types_are_compat(obj->btf, local_func_proto_id, kern_btf, kfunc_proto_id); if (ret <= 0) { - pr_warn("extern (func ksym) '%s': func_proto [%d] incompatible with kernel [%d]\n", - ext->name, local_func_proto_id, kfunc_proto_id); + if (ext->is_weak) + return 0; + + pr_warn("extern (func ksym) '%s': func_proto [%d] incompatible with %s [%d]\n", + ext->name, local_func_proto_id, + mod_btf ? mod_btf->name : "vmlinux", kfunc_proto_id); return -EINVAL; } @@ -7920,8 +8180,14 @@ static int bpf_object__resolve_ksym_func_btf_id(struct bpf_object *obj, ext->is_set = true; ext->ksym.kernel_btf_id = kfunc_id; ext->ksym.btf_fd_idx = mod_btf ? mod_btf->fd_array_idx : 0; - pr_debug("extern (func ksym) '%s': resolved to kernel [%d]\n", - ext->name, kfunc_id); + /* Also set kernel_btf_obj_fd to make sure that bpf_object__relocate_data() + * populates FD into ld_imm64 insn when it's used to point to kfunc. + * {kernel_btf_id, btf_fd_idx} -> fixup bpf_call. + * {kernel_btf_id, kernel_btf_obj_fd} -> fixup ld_imm64. + */ + ext->ksym.kernel_btf_obj_fd = mod_btf ? mod_btf->fd : 0; + pr_debug("extern (func ksym) '%s': resolved to %s [%d]\n", + ext->name, mod_btf ? mod_btf->name : "vmlinux", kfunc_id); return 0; } @@ -8064,6 +8330,37 @@ static int bpf_object__resolve_externs(struct bpf_object *obj, return 0; } +static void bpf_map_prepare_vdata(const struct bpf_map *map) +{ + struct bpf_struct_ops *st_ops; + __u32 i; + + st_ops = map->st_ops; + for (i = 0; i < btf_vlen(st_ops->type); i++) { + struct bpf_program *prog = st_ops->progs[i]; + void *kern_data; + int prog_fd; + + if (!prog) + continue; + + prog_fd = bpf_program__fd(prog); + kern_data = st_ops->kern_vdata + st_ops->kern_func_off[i]; + *(unsigned long *)kern_data = prog_fd; + } +} + +static int bpf_object_prepare_struct_ops(struct bpf_object *obj) +{ + int i; + + for (i = 0; i < obj->nr_maps; i++) + if (bpf_map__is_struct_ops(&obj->maps[i])) + bpf_map_prepare_vdata(&obj->maps[i]); + + return 0; +} + static int bpf_object_load(struct bpf_object *obj, int extra_log_level, const char *target_btf_path) { int err, i; @@ -8089,6 +8386,7 @@ static int bpf_object_load(struct bpf_object *obj, int extra_log_level, const ch err = err ? : bpf_object__relocate(obj, obj->btf_custom_path ? : target_btf_path); err = err ? : bpf_object__load_progs(obj, extra_log_level); err = err ? : bpf_object_init_prog_arrays(obj); + err = err ? : bpf_object_prepare_struct_ops(obj); if (obj->gen_loader) { /* reset FDs */ @@ -8529,6 +8827,21 @@ int bpf_object__pin(struct bpf_object *obj, const char *path) return 0; } +int bpf_object__unpin(struct bpf_object *obj, const char *path) +{ + int err; + + err = bpf_object__unpin_programs(obj, path); + if (err) + return libbpf_err(err); + + err = bpf_object__unpin_maps(obj, path); + if (err) + return libbpf_err(err); + + return 0; +} + static void bpf_map__destroy(struct bpf_map *map) { if (map->inner_map) { @@ -8540,7 +8853,10 @@ static void bpf_map__destroy(struct bpf_map *map) map->init_slots_sz = 0; if (map->mmaped) { - munmap(map->mmaped, bpf_map_mmap_sz(map)); + size_t mmap_sz; + + mmap_sz = bpf_map_mmap_sz(map->def.value_size, map->def.max_entries); + munmap(map->mmaped, mmap_sz); map->mmaped = NULL; } @@ -8573,6 +8889,7 @@ void bpf_object__close(struct bpf_object *obj) bpf_object__elf_finish(obj); bpf_object_unload(obj); btf__free(obj->btf); + btf__free(obj->btf_vmlinux); btf_ext__free(obj->btf_ext); for (i = 0; i < obj->nr_maps; i++) @@ -8580,6 +8897,10 @@ void bpf_object__close(struct bpf_object *obj) zfree(&obj->btf_custom_path); zfree(&obj->kconfig); + + for (i = 0; i < obj->nr_extern; i++) + zfree(&obj->externs[i].essent_name); + zfree(&obj->externs); obj->nr_extern = 0; @@ -8749,7 +9070,8 @@ int bpf_program__set_insns(struct bpf_program *prog, return -EBUSY; insns = libbpf_reallocarray(prog->insns, new_insn_cnt, sizeof(*insns)); - if (!insns) { + /* NULL is a valid return from reallocarray if the new count is zero */ + if (!insns && new_insn_cnt) { pr_warn("prog '%s': failed to realloc prog code\n", prog->name); return -ENOMEM; } @@ -8779,12 +9101,31 @@ enum bpf_prog_type bpf_program__type(const struct bpf_program *prog) return prog->type; } +static size_t custom_sec_def_cnt; +static struct bpf_sec_def *custom_sec_defs; +static struct bpf_sec_def custom_fallback_def; +static bool has_custom_fallback_def; +static int last_custom_sec_def_handler_id; + int bpf_program__set_type(struct bpf_program *prog, enum bpf_prog_type type) { if (prog->obj->loaded) return libbpf_err(-EBUSY); + /* if type is not changed, do nothing */ + if (prog->type == type) + return 0; + prog->type = type; + + /* If a program type was changed, we need to reset associated SEC() + * handler, as it will be invalid now. The only exception is a generic + * fallback handler, which by definition is program type-agnostic and + * is a catch-all custom handler, optionally set by the application, + * so should be able to handle any type of BPF program. + */ + if (prog->sec_def != &custom_fallback_def) + prog->sec_def = NULL; return 0; } @@ -8873,6 +9214,7 @@ static int attach_tp(const struct bpf_program *prog, long cookie, struct bpf_lin static int attach_raw_tp(const struct bpf_program *prog, long cookie, struct bpf_link **link); static int attach_trace(const struct bpf_program *prog, long cookie, struct bpf_link **link); static int attach_kprobe_multi(const struct bpf_program *prog, long cookie, struct bpf_link **link); +static int attach_uprobe_multi(const struct bpf_program *prog, long cookie, struct bpf_link **link); static int attach_lsm(const struct bpf_program *prog, long cookie, struct bpf_link **link); static int attach_iter(const struct bpf_program *prog, long cookie, struct bpf_link **link); @@ -8888,14 +9230,25 @@ static const struct bpf_sec_def section_defs[] = { SEC_DEF("uretprobe.s+", KPROBE, 0, SEC_SLEEPABLE, attach_uprobe), SEC_DEF("kprobe.multi+", KPROBE, BPF_TRACE_KPROBE_MULTI, SEC_NONE, attach_kprobe_multi), SEC_DEF("kretprobe.multi+", KPROBE, BPF_TRACE_KPROBE_MULTI, SEC_NONE, attach_kprobe_multi), + SEC_DEF("uprobe.multi+", KPROBE, BPF_TRACE_UPROBE_MULTI, SEC_NONE, attach_uprobe_multi), + SEC_DEF("uretprobe.multi+", KPROBE, BPF_TRACE_UPROBE_MULTI, SEC_NONE, attach_uprobe_multi), + SEC_DEF("uprobe.multi.s+", KPROBE, BPF_TRACE_UPROBE_MULTI, SEC_SLEEPABLE, attach_uprobe_multi), + SEC_DEF("uretprobe.multi.s+", KPROBE, BPF_TRACE_UPROBE_MULTI, SEC_SLEEPABLE, attach_uprobe_multi), SEC_DEF("ksyscall+", KPROBE, 0, SEC_NONE, attach_ksyscall), SEC_DEF("kretsyscall+", KPROBE, 0, SEC_NONE, attach_ksyscall), #ifdef HAVE_LIBELF SEC_DEF("usdt+", KPROBE, 0, SEC_NONE, attach_usdt), + SEC_DEF("usdt.s+", KPROBE, 0, SEC_USDT | SEC_SLEEPABLE, attach_usdt), #endif //HAVE_LIBELF - SEC_DEF("tc", SCHED_CLS, 0, SEC_NONE), - SEC_DEF("classifier", SCHED_CLS, 0, SEC_NONE), - SEC_DEF("action", SCHED_ACT, 0, SEC_NONE), + SEC_DEF("tc/ingress", SCHED_CLS, BPF_TCX_INGRESS, SEC_NONE), /* alias for tcx */ + SEC_DEF("tc/egress", SCHED_CLS, BPF_TCX_EGRESS, SEC_NONE), /* alias for tcx */ + SEC_DEF("tcx/ingress", SCHED_CLS, BPF_TCX_INGRESS, SEC_NONE), + SEC_DEF("tcx/egress", SCHED_CLS, BPF_TCX_EGRESS, SEC_NONE), + SEC_DEF("tc", SCHED_CLS, 0, SEC_NONE), /* deprecated / legacy, use tcx */ + SEC_DEF("classifier", SCHED_CLS, 0, SEC_NONE), /* deprecated / legacy, use tcx */ + SEC_DEF("action", SCHED_ACT, 0, SEC_NONE), /* deprecated / legacy, use tcx */ + SEC_DEF("netkit/primary", SCHED_CLS, BPF_NETKIT_PRIMARY, SEC_NONE), + SEC_DEF("netkit/peer", SCHED_CLS, BPF_NETKIT_PEER, SEC_NONE), SEC_DEF("tracepoint+", TRACEPOINT, 0, SEC_NONE, attach_tp), SEC_DEF("tp+", TRACEPOINT, 0, SEC_NONE, attach_tp), SEC_DEF("raw_tracepoint+", RAW_TRACEPOINT, 0, SEC_NONE, attach_raw_tp), @@ -8946,29 +9299,29 @@ static const struct bpf_sec_def section_defs[] = { SEC_DEF("cgroup/bind6", CGROUP_SOCK_ADDR, BPF_CGROUP_INET6_BIND, SEC_ATTACHABLE), SEC_DEF("cgroup/connect4", CGROUP_SOCK_ADDR, BPF_CGROUP_INET4_CONNECT, SEC_ATTACHABLE), SEC_DEF("cgroup/connect6", CGROUP_SOCK_ADDR, BPF_CGROUP_INET6_CONNECT, SEC_ATTACHABLE), + SEC_DEF("cgroup/connect_unix", CGROUP_SOCK_ADDR, BPF_CGROUP_UNIX_CONNECT, SEC_ATTACHABLE), SEC_DEF("cgroup/sendmsg4", CGROUP_SOCK_ADDR, BPF_CGROUP_UDP4_SENDMSG, SEC_ATTACHABLE), SEC_DEF("cgroup/sendmsg6", CGROUP_SOCK_ADDR, BPF_CGROUP_UDP6_SENDMSG, SEC_ATTACHABLE), + SEC_DEF("cgroup/sendmsg_unix", CGROUP_SOCK_ADDR, BPF_CGROUP_UNIX_SENDMSG, SEC_ATTACHABLE), SEC_DEF("cgroup/recvmsg4", CGROUP_SOCK_ADDR, BPF_CGROUP_UDP4_RECVMSG, SEC_ATTACHABLE), SEC_DEF("cgroup/recvmsg6", CGROUP_SOCK_ADDR, BPF_CGROUP_UDP6_RECVMSG, SEC_ATTACHABLE), + SEC_DEF("cgroup/recvmsg_unix", CGROUP_SOCK_ADDR, BPF_CGROUP_UNIX_RECVMSG, SEC_ATTACHABLE), SEC_DEF("cgroup/getpeername4", CGROUP_SOCK_ADDR, BPF_CGROUP_INET4_GETPEERNAME, SEC_ATTACHABLE), SEC_DEF("cgroup/getpeername6", CGROUP_SOCK_ADDR, BPF_CGROUP_INET6_GETPEERNAME, SEC_ATTACHABLE), + SEC_DEF("cgroup/getpeername_unix", CGROUP_SOCK_ADDR, BPF_CGROUP_UNIX_GETPEERNAME, SEC_ATTACHABLE), SEC_DEF("cgroup/getsockname4", CGROUP_SOCK_ADDR, BPF_CGROUP_INET4_GETSOCKNAME, SEC_ATTACHABLE), SEC_DEF("cgroup/getsockname6", CGROUP_SOCK_ADDR, BPF_CGROUP_INET6_GETSOCKNAME, SEC_ATTACHABLE), + SEC_DEF("cgroup/getsockname_unix", CGROUP_SOCK_ADDR, BPF_CGROUP_UNIX_GETSOCKNAME, SEC_ATTACHABLE), SEC_DEF("cgroup/sysctl", CGROUP_SYSCTL, BPF_CGROUP_SYSCTL, SEC_ATTACHABLE), SEC_DEF("cgroup/getsockopt", CGROUP_SOCKOPT, BPF_CGROUP_GETSOCKOPT, SEC_ATTACHABLE), SEC_DEF("cgroup/setsockopt", CGROUP_SOCKOPT, BPF_CGROUP_SETSOCKOPT, SEC_ATTACHABLE), SEC_DEF("cgroup/dev", CGROUP_DEVICE, BPF_CGROUP_DEVICE, SEC_ATTACHABLE_OPT), SEC_DEF("struct_ops+", STRUCT_OPS, 0, SEC_NONE), + SEC_DEF("struct_ops.s+", STRUCT_OPS, 0, SEC_SLEEPABLE), SEC_DEF("sk_lookup", SK_LOOKUP, BPF_SK_LOOKUP, SEC_ATTACHABLE), + SEC_DEF("netfilter", NETFILTER, BPF_NETFILTER, SEC_NONE), }; -static size_t custom_sec_def_cnt; -static struct bpf_sec_def *custom_sec_defs; -static struct bpf_sec_def custom_fallback_def; -static bool has_custom_fallback_def; - -static int last_custom_sec_def_handler_id; - int libbpf_register_prog_handler(const char *sec, enum bpf_prog_type prog_type, enum bpf_attach_type exp_attach_type, @@ -9048,7 +9401,11 @@ int libbpf_unregister_prog_handler(int handler_id) /* try to shrink the array, but it's ok if we couldn't */ sec_defs = libbpf_reallocarray(custom_sec_defs, custom_sec_def_cnt, sizeof(*sec_defs)); - if (sec_defs) + /* if new count is zero, reallocarray can return a valid NULL result; + * in this case the previous pointer will be freed, so we *have to* + * reassign old pointer to the new value (even if it's NULL) + */ + if (sec_defs || custom_sec_def_cnt == 0) custom_sec_defs = sec_defs; return 0; @@ -9201,6 +9558,7 @@ const char *libbpf_bpf_prog_type_str(enum bpf_prog_type t) } static struct bpf_map *find_struct_ops_map_by_offset(struct bpf_object *obj, + int sec_idx, size_t offset) { struct bpf_map *map; @@ -9210,7 +9568,8 @@ static struct bpf_map *find_struct_ops_map_by_offset(struct bpf_object *obj, map = &obj->maps[i]; if (!bpf_map__is_struct_ops(map)) continue; - if (map->sec_offset <= offset && + if (map->sec_idx == sec_idx && + map->sec_offset <= offset && offset - map->sec_offset < map->def.value_size) return map; } @@ -9252,7 +9611,7 @@ static int bpf_object__collect_st_ops_relos(struct bpf_object *obj, } name = elf_sym_str(obj, sym->st_name) ?: ""; - map = find_struct_ops_map_by_offset(obj, rel->r_offset); + map = find_struct_ops_map_by_offset(obj, shdr->sh_info, rel->r_offset); if (!map) { pr_warn("struct_ops reloc: cannot find map at rel->r_offset %zu\n", (size_t)rel->r_offset); @@ -9319,8 +9678,9 @@ static int bpf_object__collect_st_ops_relos(struct bpf_object *obj, } /* struct_ops BPF prog can be re-used between multiple - * .struct_ops as long as it's the same struct_ops struct - * definition and the same function pointer field + * .struct_ops & .struct_ops.link as long as it's the + * same struct_ops struct definition and the same + * function pointer field */ if (prog->attach_btf_id != st_ops->type_id || prog->expected_attach_type != member_idx) { @@ -9420,9 +9780,9 @@ static int libbpf_find_prog_btf_id(const char *name, __u32 attach_prog_fd) int err; memset(&info, 0, info_len); - err = bpf_obj_get_info_by_fd(attach_prog_fd, &info, &info_len); + err = bpf_prog_get_info_by_fd(attach_prog_fd, &info, &info_len); if (err) { - pr_warn("failed bpf_obj_get_info_by_fd for FD %d: %d\n", + pr_warn("failed bpf_prog_get_info_by_fd for FD %d: %d\n", attach_prog_fd, err); return err; } @@ -9656,10 +10016,103 @@ __u32 bpf_map__value_size(const struct bpf_map *map) return map->def.value_size; } +static int map_btf_datasec_resize(struct bpf_map *map, __u32 size) +{ + struct btf *btf; + struct btf_type *datasec_type, *var_type; + struct btf_var_secinfo *var; + const struct btf_type *array_type; + const struct btf_array *array; + int vlen, element_sz, new_array_id; + __u32 nr_elements; + + /* check btf existence */ + btf = bpf_object__btf(map->obj); + if (!btf) + return -ENOENT; + + /* verify map is datasec */ + datasec_type = btf_type_by_id(btf, bpf_map__btf_value_type_id(map)); + if (!btf_is_datasec(datasec_type)) { + pr_warn("map '%s': cannot be resized, map value type is not a datasec\n", + bpf_map__name(map)); + return -EINVAL; + } + + /* verify datasec has at least one var */ + vlen = btf_vlen(datasec_type); + if (vlen == 0) { + pr_warn("map '%s': cannot be resized, map value datasec is empty\n", + bpf_map__name(map)); + return -EINVAL; + } + + /* verify last var in the datasec is an array */ + var = &btf_var_secinfos(datasec_type)[vlen - 1]; + var_type = btf_type_by_id(btf, var->type); + array_type = skip_mods_and_typedefs(btf, var_type->type, NULL); + if (!btf_is_array(array_type)) { + pr_warn("map '%s': cannot be resized, last var must be an array\n", + bpf_map__name(map)); + return -EINVAL; + } + + /* verify request size aligns with array */ + array = btf_array(array_type); + element_sz = btf__resolve_size(btf, array->type); + if (element_sz <= 0 || (size - var->offset) % element_sz != 0) { + pr_warn("map '%s': cannot be resized, element size (%d) doesn't align with new total size (%u)\n", + bpf_map__name(map), element_sz, size); + return -EINVAL; + } + + /* create a new array based on the existing array, but with new length */ + nr_elements = (size - var->offset) / element_sz; + new_array_id = btf__add_array(btf, array->index_type, array->type, nr_elements); + if (new_array_id < 0) + return new_array_id; + + /* adding a new btf type invalidates existing pointers to btf objects, + * so refresh pointers before proceeding + */ + datasec_type = btf_type_by_id(btf, map->btf_value_type_id); + var = &btf_var_secinfos(datasec_type)[vlen - 1]; + var_type = btf_type_by_id(btf, var->type); + + /* finally update btf info */ + datasec_type->size = size; + var->size = size - var->offset; + var_type->type = new_array_id; + + return 0; +} + int bpf_map__set_value_size(struct bpf_map *map, __u32 size) { if (map->fd >= 0) return libbpf_err(-EBUSY); + + if (map->mmaped) { + int err; + size_t mmap_old_sz, mmap_new_sz; + + mmap_old_sz = bpf_map_mmap_sz(map->def.value_size, map->def.max_entries); + mmap_new_sz = bpf_map_mmap_sz(size, map->def.max_entries); + err = bpf_map_mmap_resize(map, mmap_old_sz, mmap_new_sz); + if (err) { + pr_warn("map '%s': failed to resize memory-mapped region: %d\n", + bpf_map__name(map), err); + return err; + } + err = map_btf_datasec_resize(map, size); + if (err && err != -ENOENT) { + pr_warn("map '%s': failed to adjust resized BTF, clearing BTF key/value info: %d\n", + bpf_map__name(map), err); + map->btf_value_type_id = 0; + map->btf_key_type_id = 0; + } + } + map->def.value_size = size; return 0; } @@ -9685,7 +10138,7 @@ int bpf_map__set_initial_value(struct bpf_map *map, return 0; } -const void *bpf_map__initial_value(struct bpf_map *map, size_t *psize) +void *bpf_map__initial_value(struct bpf_map *map, size_t *psize) { if (!map->mmaped) return NULL; @@ -10114,6 +10567,7 @@ struct bpf_link *bpf_program__attach_perf_event_opts(const struct bpf_program *p char errmsg[STRERR_BUFSIZE]; struct bpf_link_perf *link; int prog_fd, link_fd = -1, err; + bool force_ioctl_attach; if (!OPTS_VALID(opts, bpf_perf_event_opts)) return libbpf_err_ptr(-EINVAL); @@ -10137,7 +10591,8 @@ struct bpf_link *bpf_program__attach_perf_event_opts(const struct bpf_program *p link->link.dealloc = &bpf_link_perf_dealloc; link->perf_event_fd = pfd; - if (kernel_supports(prog->obj, FEAT_PERF_LINK)) { + force_ioctl_attach = OPTS_GET(opts, force_ioctl_attach, false); + if (kernel_supports(prog->obj, FEAT_PERF_LINK) && !force_ioctl_attach) { DECLARE_LIBBPF_OPTS(bpf_link_create_opts, link_opts, .perf_event.bpf_cookie = OPTS_GET(opts, bpf_cookie, 0)); @@ -10199,7 +10654,7 @@ static int parse_uint_from_file(const char *file, const char *fmt) int err, ret; FILE *f; - f = fopen(file, "r"); + f = fopen(file, "re"); if (!f) { err = -errno; pr_debug("failed to open '%s': %s\n", file, @@ -10300,16 +10755,20 @@ static int append_to_file(const char *file, const char *fmt, ...) { int fd, n, err = 0; va_list ap; + char buf[1024]; + + va_start(ap, fmt); + n = vsnprintf(buf, sizeof(buf), fmt, ap); + va_end(ap); + + if (n < 0 || n >= sizeof(buf)) + return -EINVAL; fd = open(file, O_WRONLY | O_APPEND | O_CLOEXEC, 0); if (fd < 0) return -errno; - va_start(ap, fmt); - n = vdprintf(fd, fmt, ap); - va_end(ap); - - if (n < 0) + if (write(fd, buf, n) < 0) err = -errno; close(fd); @@ -10344,13 +10803,32 @@ static const char *tracefs_uprobe_events(void) return use_debugfs() ? DEBUGFS"/uprobe_events" : TRACEFS"/uprobe_events"; } +static const char *tracefs_available_filter_functions(void) +{ + return use_debugfs() ? DEBUGFS"/available_filter_functions" + : TRACEFS"/available_filter_functions"; +} + +static const char *tracefs_available_filter_functions_addrs(void) +{ + return use_debugfs() ? DEBUGFS"/available_filter_functions_addrs" + : TRACEFS"/available_filter_functions_addrs"; +} + static void gen_kprobe_legacy_event_name(char *buf, size_t buf_sz, const char *kfunc_name, size_t offset) { static int index = 0; + int i; snprintf(buf, buf_sz, "libbpf_%u_%s_0x%zx_%d", getpid(), kfunc_name, offset, __sync_fetch_and_add(&index, 1)); + + /* sanitize binary_path in the probe name */ + for (i = 0; buf[i]; i++) { + if (!isalnum(buf[i])) + buf[i] = '_'; + } } static int add_kprobe_event_legacy(const char *probe_name, bool retprobe, @@ -10489,6 +10967,7 @@ bpf_program__attach_kprobe_opts(const struct bpf_program *prog, const struct bpf_kprobe_opts *opts) { DECLARE_LIBBPF_OPTS(bpf_perf_event_opts, pe_opts); + enum probe_attach_mode attach_mode; char errmsg[STRERR_BUFSIZE]; char *legacy_probe = NULL; struct bpf_link *link; @@ -10499,11 +10978,32 @@ bpf_program__attach_kprobe_opts(const struct bpf_program *prog, if (!OPTS_VALID(opts, bpf_kprobe_opts)) return libbpf_err_ptr(-EINVAL); + attach_mode = OPTS_GET(opts, attach_mode, PROBE_ATTACH_MODE_DEFAULT); retprobe = OPTS_GET(opts, retprobe, false); offset = OPTS_GET(opts, offset, 0); pe_opts.bpf_cookie = OPTS_GET(opts, bpf_cookie, 0); legacy = determine_kprobe_perf_type() < 0; + switch (attach_mode) { + case PROBE_ATTACH_MODE_LEGACY: + legacy = true; + pe_opts.force_ioctl_attach = true; + break; + case PROBE_ATTACH_MODE_PERF: + if (legacy) + return libbpf_err_ptr(-ENOTSUP); + pe_opts.force_ioctl_attach = true; + break; + case PROBE_ATTACH_MODE_LINK: + if (legacy || !kernel_supports(prog->obj, FEAT_PERF_LINK)) + return libbpf_err_ptr(-ENOTSUP); + break; + case PROBE_ATTACH_MODE_DEFAULT: + break; + default: + return libbpf_err_ptr(-EINVAL); + } + if (!legacy) { pfd = perf_event_open_probe(false /* uprobe */, retprobe, func_name, offset, @@ -10597,7 +11097,7 @@ struct bpf_link *bpf_program__attach_ksyscall(const struct bpf_program *prog, } /* Adapted from perf/util/string.c */ -static bool glob_match(const char *str, const char *pat) +bool glob_match(const char *str, const char *pat) { while (*str && *pat && *pat != '*') { if (*pat == '?') { /* Matches any single character */ @@ -10630,25 +11130,158 @@ struct kprobe_multi_resolve { size_t cnt; }; -static int -resolve_kprobe_multi_cb(unsigned long long sym_addr, char sym_type, - const char *sym_name, void *ctx) +struct avail_kallsyms_data { + char **syms; + size_t cnt; + struct kprobe_multi_resolve *res; +}; + +static int avail_func_cmp(const void *a, const void *b) { - struct kprobe_multi_resolve *res = ctx; + return strcmp(*(const char **)a, *(const char **)b); +} + +static int avail_kallsyms_cb(unsigned long long sym_addr, char sym_type, + const char *sym_name, void *ctx) +{ + struct avail_kallsyms_data *data = ctx; + struct kprobe_multi_resolve *res = data->res; int err; - if (!glob_match(sym_name, res->pattern)) + if (!bsearch(&sym_name, data->syms, data->cnt, sizeof(*data->syms), avail_func_cmp)) return 0; - err = libbpf_ensure_mem((void **) &res->addrs, &res->cap, sizeof(unsigned long), - res->cnt + 1); + err = libbpf_ensure_mem((void **)&res->addrs, &res->cap, sizeof(*res->addrs), res->cnt + 1); if (err) return err; - res->addrs[res->cnt++] = (unsigned long) sym_addr; + res->addrs[res->cnt++] = (unsigned long)sym_addr; return 0; } +static int libbpf_available_kallsyms_parse(struct kprobe_multi_resolve *res) +{ + const char *available_functions_file = tracefs_available_filter_functions(); + struct avail_kallsyms_data data; + char sym_name[500]; + FILE *f; + int err = 0, ret, i; + char **syms = NULL; + size_t cap = 0, cnt = 0; + + f = fopen(available_functions_file, "re"); + if (!f) { + err = -errno; + pr_warn("failed to open %s: %d\n", available_functions_file, err); + return err; + } + + while (true) { + char *name; + + ret = fscanf(f, "%499s%*[^\n]\n", sym_name); + if (ret == EOF && feof(f)) + break; + + if (ret != 1) { + pr_warn("failed to parse available_filter_functions entry: %d\n", ret); + err = -EINVAL; + goto cleanup; + } + + if (!glob_match(sym_name, res->pattern)) + continue; + + err = libbpf_ensure_mem((void **)&syms, &cap, sizeof(*syms), cnt + 1); + if (err) + goto cleanup; + + name = strdup(sym_name); + if (!name) { + err = -errno; + goto cleanup; + } + + syms[cnt++] = name; + } + + /* no entries found, bail out */ + if (cnt == 0) { + err = -ENOENT; + goto cleanup; + } + + /* sort available functions */ + qsort(syms, cnt, sizeof(*syms), avail_func_cmp); + + data.syms = syms; + data.res = res; + data.cnt = cnt; + libbpf_kallsyms_parse(avail_kallsyms_cb, &data); + + if (res->cnt == 0) + err = -ENOENT; + +cleanup: + for (i = 0; i < cnt; i++) + free((char *)syms[i]); + free(syms); + + fclose(f); + return err; +} + +static bool has_available_filter_functions_addrs(void) +{ + return access(tracefs_available_filter_functions_addrs(), R_OK) != -1; +} + +static int libbpf_available_kprobes_parse(struct kprobe_multi_resolve *res) +{ + const char *available_path = tracefs_available_filter_functions_addrs(); + char sym_name[500]; + FILE *f; + int ret, err = 0; + unsigned long long sym_addr; + + f = fopen(available_path, "re"); + if (!f) { + err = -errno; + pr_warn("failed to open %s: %d\n", available_path, err); + return err; + } + + while (true) { + ret = fscanf(f, "%llx %499s%*[^\n]\n", &sym_addr, sym_name); + if (ret == EOF && feof(f)) + break; + + if (ret != 2) { + pr_warn("failed to parse available_filter_functions_addrs entry: %d\n", + ret); + err = -EINVAL; + goto cleanup; + } + + if (!glob_match(sym_name, res->pattern)) + continue; + + err = libbpf_ensure_mem((void **)&res->addrs, &res->cap, + sizeof(*res->addrs), res->cnt + 1); + if (err) + goto cleanup; + + res->addrs[res->cnt++] = (unsigned long)sym_addr; + } + + if (res->cnt == 0) + err = -ENOENT; + +cleanup: + fclose(f); + return err; +} + struct bpf_link * bpf_program__attach_kprobe_multi_opts(const struct bpf_program *prog, const char *pattern, @@ -10685,13 +11318,12 @@ bpf_program__attach_kprobe_multi_opts(const struct bpf_program *prog, return libbpf_err_ptr(-EINVAL); if (pattern) { - err = libbpf_kallsyms_parse(resolve_kprobe_multi_cb, &res); + if (has_available_filter_functions_addrs()) + err = libbpf_available_kprobes_parse(&res); + else + err = libbpf_available_kallsyms_parse(&res); if (err) goto error; - if (!res.cnt) { - err = -ENOENT; - goto error; - } addrs = res.addrs; cnt = res.cnt; } @@ -10818,6 +11450,37 @@ static int attach_kprobe_multi(const struct bpf_program *prog, long cookie, stru return libbpf_get_error(*link); } +static int attach_uprobe_multi(const struct bpf_program *prog, long cookie, struct bpf_link **link) +{ + char *probe_type = NULL, *binary_path = NULL, *func_name = NULL; + LIBBPF_OPTS(bpf_uprobe_multi_opts, opts); + int n, ret = -EINVAL; + + *link = NULL; + + n = sscanf(prog->sec_name, "%m[^/]/%m[^:]:%m[^\n]", + &probe_type, &binary_path, &func_name); + switch (n) { + case 1: + /* handle SEC("u[ret]probe") - format is valid, but auto-attach is impossible. */ + ret = 0; + break; + case 3: + opts.retprobe = strcmp(probe_type, "uretprobe.multi") == 0; + *link = bpf_program__attach_uprobe_multi(prog, -1, binary_path, func_name, &opts); + ret = libbpf_get_error(*link); + break; + default: + pr_warn("prog '%s': invalid format of section definition '%s'\n", prog->name, + prog->sec_name); + break; + } + free(probe_type); + free(binary_path); + free(func_name); + return ret; +} + static void gen_uprobe_legacy_event_name(char *buf, size_t buf_sz, const char *binary_path, uint64_t offset) { @@ -10900,283 +11563,78 @@ err_clean_legacy: return err; } -/* Return next ELF section of sh_type after scn, or first of that type if scn is NULL. */ -#ifdef HAVE_LIBELF -static Elf_Scn *elf_find_next_scn_by_type(Elf *elf, int sh_type, Elf_Scn *scn) -{ - while ((scn = elf_nextscn(elf, scn)) != NULL) { - GElf_Shdr sh; - - if (!gelf_getshdr(scn, &sh)) - continue; - if (sh.sh_type == sh_type) - return scn; - } - return NULL; -} -#elif HAVE_ELFIO -static psection_t elf_find_next_scn_by_type(pelfio_t pelfio, int sh_type, psection_t pscn) -{ - int secno = elfio_get_sections_num(pelfio); - int j = 0; - if (pscn != NULL) { - for (int i = 0; i < secno; i++) { - psection_t psection = elfio_get_section_by_index(pelfio, i); - if (psection == pscn) { - j = i; - } - } - } - for (; j < secno; j++) { - psection_t psection = elfio_get_section_by_index(pelfio, j); - Elf_Word sec_type = elfio_section_get_type(psection); - if (sec_type == sh_type) { - return psection; - } - } - return NULL; -} -#endif - /* Find offset of function name in object specified by path. "name" matches * symbol name or name@@LIB for library functions. */ -static long elf_find_func_offset(const char *binary_path, const char *name) +static long elf_find_func_offset_from_archive(const char *archive_path, const char *file_name, + const char *func_name) { - int fd, i, sh_types[2] = { SHT_DYNSYM, SHT_SYMTAB }; - bool is_shared_lib, is_name_qualified; - char errmsg[STRERR_BUFSIZE]; - long ret = -ENOENT; - size_t name_len; -#ifdef HAVE_LIBELF - GElf_Ehdr ehdr; + struct zip_archive *archive; + struct zip_entry entry; + long ret; +#ifdef HAVE_LIBELF Elf *elf; -#elif HAVE_ELFIO - pelfio_t pelfio = elfio_new(); - bool is_load = elfio_load(pelfio, binary_path); -#endif // -#ifdef HAVE_LIBELF - fd = open(binary_path, O_RDONLY | O_CLOEXEC); - if (fd < 0) { -#elif HAVE_ELFIO - if (!is_load) { -#endif // - ret = -errno; - pr_warn("failed to open %s: %s\n", binary_path, - libbpf_strerror_r(ret, errmsg, sizeof(errmsg))); +#elif defined HAVE_ELFIO + pelfio_t elf; +#endif + + archive = zip_archive_open(archive_path); + if (IS_ERR(archive)) { + ret = PTR_ERR(archive); + pr_warn("zip: failed to open %s: %ld\n", archive_path, ret); return ret; } -#ifdef HAVE_LIBELF - elf = elf_begin(fd, ELF_C_READ_MMAP, NULL); - if (!elf) { - pr_warn("elf: could not read elf from %s: %s\n", binary_path, elf_errmsg(-1)); - close(fd); - return -LIBBPF_ERRNO__FORMAT; + + ret = zip_archive_find_entry(archive, file_name, &entry); + if (ret) { + pr_warn("zip: could not find archive member %s in %s: %ld\n", file_name, + archive_path, ret); + goto out; } - if (!gelf_getehdr(elf, &ehdr)) { - pr_warn("elf: failed to get ehdr from %s: %s\n", binary_path, elf_errmsg(-1)); + pr_debug("zip: found entry for %s in %s at 0x%lx\n", file_name, archive_path, + (unsigned long)entry.data_offset); + + if (entry.compression) { + pr_warn("zip: entry %s of %s is compressed and cannot be handled\n", file_name, + archive_path); ret = -LIBBPF_ERRNO__FORMAT; goto out; } - /* for shared lib case, we do not need to calculate relative offset */ - is_shared_lib = ehdr.e_type == ET_DYN; -#elif HAVA_ELFIO - is_shared_lib = (ET_DYN == elfio_get_type(pelfio)); -#endif //HAVE_LIBELF - - name_len = strlen(name); - /* Does name specify "@@LIB"? */ - is_name_qualified = strstr(name, "@@") != NULL; - - /* Search SHT_DYNSYM, SHT_SYMTAB for symbol. This search order is used because if - * a binary is stripped, it may only have SHT_DYNSYM, and a fully-statically - * linked binary may not have SHT_DYMSYM, so absence of a section should not be - * reported as a warning/error. - */ - for (i = 0; i < ARRAY_SIZE(sh_types); i++) { - size_t nr_syms, strtabidx, idx; -#ifdef HAVE_LIBELF - Elf_Data *symbols = NULL; - Elf_Scn *scn = NULL; - GElf_Shdr sh; -#elif HAVE_ELFIO - Elf_Data realData = {}; - psection_t pSec = NULL; -#endif //HAVE_LIBELF - int last_bind = -1; - const char *sname; -#ifdef HAVE_LIBELF - scn = elf_find_next_scn_by_type(elf, sh_types[i], NULL); -#elif HAVE_ELFIO - pSec = elf_find_next_scn_by_type(pelfio, sh_types[i], NULL); -#endif //HAVE_LIBELF -#ifdef HAVE_LIBELF - if (!scn) { -#elif HAVE_ELFIO - if (!pSec) { -#endif //HAVE_LIBELF - pr_debug("elf: failed to find symbol table ELF sections in '%s'\n", - binary_path); - continue; - } -#ifdef HAVE_LIBELF - if (!gelf_getshdr(scn, &sh)) - continue; - strtabidx = sh.sh_link; -#elif HAVE_ELFIO - strtabidx = elfio_section_get_link(pSec); -#endif // -#ifdef HAVE_LIBELF - symbols = elf_getdata(scn, 0); - if (!symbols) { - pr_warn("elf: failed to get symbols for symtab section in '%s': %s\n", - binary_path, elf_errmsg(-1)); - ret = -LIBBPF_ERRNO__FORMAT; - goto out; - } - nr_syms = symbols->d_size / sh.sh_entsize; -#elif HAVE_ELFIO - realData.d_buf = (void*)elfio_section_get_data(pSec); - realData.d_size = elfio_section_get_size(pSec); - if (!realData.d_buf) { - pr_warn("elf: failed to get symbols for symtab section in '%s': %s\n", - binary_path, elf_errmsg(-1)); - ret = -LIBBPF_ERRNO__FORMAT; - goto out; - } - nr_syms = realData.d_size / elfio_section_get_entry_size(pSec); - psymbol_t psymbols; - psymbols = elfio_symbol_section_accessor_new(pelfio, pSec); - if (!psymbols) { - pr_warn("elf: failed to get symbols in '%s': %s\n", - binary_path, elf_errmsg(-1)); - ret = -LIBBPF_ERRNO__FORMAT; - goto out; - } -#endif //HAVE_LIBELF - for (idx = 0; idx < nr_syms; idx++) { - int curr_bind; -#ifdef HAVE_LIBELF - GElf_Sym sym; - Elf_Scn *sym_scn; - GElf_Shdr sym_sh; - - if (!gelf_getsym(symbols, idx, &sym)) - continue; - - if (GELF_ST_TYPE(sym.st_info) != STT_FUNC) - continue; - - sname = elf_strptr(elf, strtabidx, sym.st_name); - if (!sname) - continue; - - curr_bind = GELF_ST_BIND(sym.st_info); -#elif HAVE_ELFIO - char sname[128] = {0}; - Elf64_Addr st_value; - Elf_Xword size; - unsigned char bind; - unsigned char type; - Elf_Half section_index; - unsigned char other; - elfio_symbol_get_symbol( psymbols, idx, sname, 128, &st_value, &size, &bind, - &type, §ion_index, &other ); - if (type == STT_FUNC) - continue; - if (sname[0] == '\0') - continue; - curr_bind = bind; -#endif //HAVE_LIBELF - - - /* User can specify func, func@@LIB or func@@LIB_VERSION. */ - if (strncmp(sname, name, name_len) != 0) - continue; - /* ...but we don't want a search for "foo" to match 'foo2" also, so any - * additional characters in sname should be of the form "@@LIB". - */ - if (!is_name_qualified && sname[name_len] != '\0' && sname[name_len] != '@') - continue; - - if (ret >= 0) { - /* handle multiple matches */ - if (last_bind != STB_WEAK && curr_bind != STB_WEAK) { - /* Only accept one non-weak bind. */ - pr_warn("elf: ambiguous match for '%s', '%s' in '%s'\n", - sname, name, binary_path); - ret = -LIBBPF_ERRNO__FORMAT; - goto out; - } else if (curr_bind == STB_WEAK) { - /* already have a non-weak bind, and - * this is a weak bind, so ignore. - */ - continue; - } - } - - /* Transform symbol's virtual address (absolute for - * binaries and relative for shared libs) into file - * offset, which is what kernel is expecting for - * uprobe/uretprobe attachment. - * See Documentation/trace/uprobetracer.rst for more - * details. - * This is done by looking up symbol's containing - * section's header and using it's virtual address - * (sh_addr) and corresponding file offset (sh_offset) - * to transform sym.st_value (virtual address) into - * desired final file offset. - */ -#ifdef HAVE_LIBELF - sym_scn = elf_getscn(elf, sym.st_shndx); - if (!sym_scn) - continue; - if (!gelf_getshdr(sym_scn, &sym_sh)) - continue; - ret = sym.st_value - sym_sh.sh_addr + sym_sh.sh_offset; -#elif HAVE_ELFIO - psection_t pSymSection= NULL; - pSymSection = elfio_get_section_by_index(pelfio,section_index); - if (!pSymSection) - continue; - Elf64_Addr sh_addr = elfio_section_get_address(pSymSection); - Elf64_Off sh_offset = elfio_section_get_offset(pSymSection); - ret = st_value - sh_addr + sh_offset; -#endif //HAVE_LIBELF - last_bind = curr_bind; - } -#ifdef HAVE_ELFIO - elfio_symbol_section_accessor_delete(psymbols); -#endif //HAVE_ELFIO - if (ret > 0) - break; +#ifdef HAVE_LIBELF + elf = elf_memory((void *)entry.data, entry.data_length); +#elif defined HAVE_ELFIO + char memfd_path[PATH_MAX] = {0}; + elf = elfio_new(); + int fdm = syscall(__NR_memfd_create, "bpfelf", MFD_CLOEXEC); + ftruncate(fdm, entry.data_length); + write(fdm, (char *)entry.data, entry.data_length); + snprintf(memfd_path, PATH_MAX, "/proc/self/fd/%d", fdm); + elfio_load(elf, memfd_path); +#endif + if (!elf) { + pr_warn("elf: could not read elf file %s from %s: %s\n", file_name, archive_path, + elf_errmsg(-1)); + ret = -LIBBPF_ERRNO__LIBELF; + goto out; } + ret = elf_find_func_offset(elf, file_name, func_name); if (ret > 0) { - pr_debug("elf: symbol address match for '%s' in '%s': 0x%lx\n", name, binary_path, - ret); - } else { - if (ret == 0) { - pr_warn("elf: '%s' is 0 in symtab for '%s': %s\n", name, binary_path, - is_shared_lib ? "should not be 0 in a shared library" : - "try using shared library path instead"); - ret = -ENOENT; - } else { - pr_warn("elf: failed to find symbol '%s' in '%s'\n", name, binary_path); - } + pr_debug("elf: symbol address match for %s of %s in %s: 0x%x + 0x%lx = 0x%lx\n", + func_name, file_name, archive_path, entry.data_offset, ret, + ret + entry.data_offset); + ret += entry.data_offset; } -out: -#ifdef HAVE_LIBELF +#ifdef HAVA_LIBELF elf_end(elf); - close(fd); -#elif HAVE_ELFIO - elfio_delete(pelfio); -#endif //HAVE_LIBELF +#elif HAVA_ELFIO + elfio_delete(elf); +#endif +out: + zip_archive_close(archive); return ret; } - - static const char *arch_specific_lib_paths(void) { /* @@ -11257,14 +11715,130 @@ static int resolve_full_path(const char *file, char *result, size_t result_sz) return -ENOENT; } +struct bpf_link * +bpf_program__attach_uprobe_multi(const struct bpf_program *prog, + pid_t pid, + const char *path, + const char *func_pattern, + const struct bpf_uprobe_multi_opts *opts) +{ + const unsigned long *ref_ctr_offsets = NULL, *offsets = NULL; + LIBBPF_OPTS(bpf_link_create_opts, lopts); + unsigned long *resolved_offsets = NULL; + int err = 0, link_fd, prog_fd; + struct bpf_link *link = NULL; + char errmsg[STRERR_BUFSIZE]; + char full_path[PATH_MAX]; + const __u64 *cookies; + const char **syms; + size_t cnt; + + if (!OPTS_VALID(opts, bpf_uprobe_multi_opts)) + return libbpf_err_ptr(-EINVAL); + + syms = OPTS_GET(opts, syms, NULL); + offsets = OPTS_GET(opts, offsets, NULL); + ref_ctr_offsets = OPTS_GET(opts, ref_ctr_offsets, NULL); + cookies = OPTS_GET(opts, cookies, NULL); + cnt = OPTS_GET(opts, cnt, 0); + + /* + * User can specify 2 mutually exclusive set of inputs: + * + * 1) use only path/func_pattern/pid arguments + * + * 2) use path/pid with allowed combinations of: + * syms/offsets/ref_ctr_offsets/cookies/cnt + * + * - syms and offsets are mutually exclusive + * - ref_ctr_offsets and cookies are optional + * + * Any other usage results in error. + */ + + if (!path) + return libbpf_err_ptr(-EINVAL); + if (!func_pattern && cnt == 0) + return libbpf_err_ptr(-EINVAL); + + if (func_pattern) { + if (syms || offsets || ref_ctr_offsets || cookies || cnt) + return libbpf_err_ptr(-EINVAL); + } else { + if (!!syms == !!offsets) + return libbpf_err_ptr(-EINVAL); + } + + if (func_pattern) { + if (!strchr(path, '/')) { + err = resolve_full_path(path, full_path, sizeof(full_path)); + if (err) { + pr_warn("prog '%s': failed to resolve full path for '%s': %d\n", + prog->name, path, err); + return libbpf_err_ptr(err); + } + path = full_path; + } + + err = elf_resolve_pattern_offsets(path, func_pattern, + &resolved_offsets, &cnt); + if (err < 0) + return libbpf_err_ptr(err); + offsets = resolved_offsets; + } else if (syms) { + err = elf_resolve_syms_offsets(path, cnt, syms, &resolved_offsets); + if (err < 0) + return libbpf_err_ptr(err); + offsets = resolved_offsets; + } + + lopts.uprobe_multi.path = path; + lopts.uprobe_multi.offsets = offsets; + lopts.uprobe_multi.ref_ctr_offsets = ref_ctr_offsets; + lopts.uprobe_multi.cookies = cookies; + lopts.uprobe_multi.cnt = cnt; + lopts.uprobe_multi.flags = OPTS_GET(opts, retprobe, false) ? BPF_F_UPROBE_MULTI_RETURN : 0; + + if (pid == 0) + pid = getpid(); + if (pid > 0) + lopts.uprobe_multi.pid = pid; + + link = calloc(1, sizeof(*link)); + if (!link) { + err = -ENOMEM; + goto error; + } + link->detach = &bpf_link__detach_fd; + + prog_fd = bpf_program__fd(prog); + link_fd = bpf_link_create(prog_fd, 0, BPF_TRACE_UPROBE_MULTI, &lopts); + if (link_fd < 0) { + err = -errno; + pr_warn("prog '%s': failed to attach multi-uprobe: %s\n", + prog->name, libbpf_strerror_r(err, errmsg, sizeof(errmsg))); + goto error; + } + link->fd = link_fd; + free(resolved_offsets); + return link; + +error: + free(resolved_offsets); + free(link); + return libbpf_err_ptr(err); +} + LIBBPF_API struct bpf_link * bpf_program__attach_uprobe_opts(const struct bpf_program *prog, pid_t pid, const char *binary_path, size_t func_offset, const struct bpf_uprobe_opts *opts) { - DECLARE_LIBBPF_OPTS(bpf_perf_event_opts, pe_opts); + const char *archive_path = NULL, *archive_sep = NULL; char errmsg[STRERR_BUFSIZE], *legacy_probe = NULL; - char full_binary_path[PATH_MAX]; + DECLARE_LIBBPF_OPTS(bpf_perf_event_opts, pe_opts); + enum probe_attach_mode attach_mode; + char full_path[PATH_MAX]; struct bpf_link *link; size_t ref_ctr_off; int pfd, err; @@ -11274,6 +11848,7 @@ bpf_program__attach_uprobe_opts(const struct bpf_program *prog, pid_t pid, if (!OPTS_VALID(opts, bpf_uprobe_opts)) return libbpf_err_ptr(-EINVAL); + attach_mode = OPTS_GET(opts, attach_mode, PROBE_ATTACH_MODE_DEFAULT); retprobe = OPTS_GET(opts, retprobe, false); ref_ctr_off = OPTS_GET(opts, ref_ctr_offset, 0); pe_opts.bpf_cookie = OPTS_GET(opts, bpf_cookie, 0); @@ -11281,27 +11856,60 @@ bpf_program__attach_uprobe_opts(const struct bpf_program *prog, pid_t pid, if (!binary_path) return libbpf_err_ptr(-EINVAL); - if (!strchr(binary_path, '/')) { - err = resolve_full_path(binary_path, full_binary_path, - sizeof(full_binary_path)); + /* Check if "binary_path" refers to an archive. */ + archive_sep = strstr(binary_path, "!/"); + if (archive_sep) { + full_path[0] = '\0'; + libbpf_strlcpy(full_path, binary_path, + min(sizeof(full_path), (size_t)(archive_sep - binary_path + 1))); + archive_path = full_path; + binary_path = archive_sep + 2; + } else if (!strchr(binary_path, '/')) { + err = resolve_full_path(binary_path, full_path, sizeof(full_path)); if (err) { pr_warn("prog '%s': failed to resolve full path for '%s': %d\n", prog->name, binary_path, err); return libbpf_err_ptr(err); } - binary_path = full_binary_path; + binary_path = full_path; } func_name = OPTS_GET(opts, func_name, NULL); if (func_name) { long sym_off; - sym_off = elf_find_func_offset(binary_path, func_name); + if (archive_path) { + sym_off = elf_find_func_offset_from_archive(archive_path, binary_path, + func_name); + binary_path = archive_path; + } else { + sym_off = elf_find_func_offset_from_file(binary_path, func_name); + } if (sym_off < 0) return libbpf_err_ptr(sym_off); func_offset += sym_off; } legacy = determine_uprobe_perf_type() < 0; + switch (attach_mode) { + case PROBE_ATTACH_MODE_LEGACY: + legacy = true; + pe_opts.force_ioctl_attach = true; + break; + case PROBE_ATTACH_MODE_PERF: + if (legacy) + return libbpf_err_ptr(-ENOTSUP); + pe_opts.force_ioctl_attach = true; + break; + case PROBE_ATTACH_MODE_LINK: + if (legacy || !kernel_supports(prog->obj, FEAT_PERF_LINK)) + return libbpf_err_ptr(-ENOTSUP); + break; + case PROBE_ATTACH_MODE_DEFAULT: + break; + default: + return libbpf_err_ptr(-EINVAL); + } + if (!legacy) { pfd = perf_event_open_probe(true /* uprobe */, retprobe, binary_path, func_offset, pid, ref_ctr_off); @@ -11370,14 +11978,14 @@ err_out: static int attach_uprobe(const struct bpf_program *prog, long cookie, struct bpf_link **link) { DECLARE_LIBBPF_OPTS(bpf_uprobe_opts, opts); - char *probe_type = NULL, *binary_path = NULL, *func_name = NULL; - int n, ret = -EINVAL; + char *probe_type = NULL, *binary_path = NULL, *func_name = NULL, *func_off; + int n, c, ret = -EINVAL; long offset = 0; *link = NULL; - n = sscanf(prog->sec_name, "%m[^/]/%m[^:]:%m[a-zA-Z0-9_.]+%li", - &probe_type, &binary_path, &func_name, &offset); + n = sscanf(prog->sec_name, "%m[^/]/%m[^:]:%m[^\n]", + &probe_type, &binary_path, &func_name); switch (n) { case 1: /* handle SEC("u[ret]probe") - format is valid, but auto-attach is impossible. */ @@ -11388,7 +11996,17 @@ static int attach_uprobe(const struct bpf_program *prog, long cookie, struct bpf prog->name, prog->sec_name); break; case 3: - case 4: + /* check if user specifies `+offset`, if yes, this should be + * the last part of the string, make sure sscanf read to EOL + */ + func_off = strrchr(func_name, '+'); + if (func_off) { + n = sscanf(func_off, "+%li%n", &offset, &c); + if (n == 1 && *(func_off + c) == '\0') + func_off[0] = '\0'; + else + offset = 0; + } opts.retprobe = strcmp(probe_type, "uretprobe") == 0 || strcmp(probe_type, "uretprobe.s") == 0; if (opts.retprobe && offset != 0) { @@ -11769,11 +12387,10 @@ static int attach_lsm(const struct bpf_program *prog, long cookie, struct bpf_li } static struct bpf_link * -bpf_program__attach_fd(const struct bpf_program *prog, int target_fd, int btf_id, - const char *target_name) +bpf_program_attach_fd(const struct bpf_program *prog, + int target_fd, const char *target_name, + const struct bpf_link_create_opts *opts) { - DECLARE_LIBBPF_OPTS(bpf_link_create_opts, opts, - .target_btf_id = btf_id); enum bpf_attach_type attach_type; char errmsg[STRERR_BUFSIZE]; struct bpf_link *link; @@ -11791,7 +12408,7 @@ bpf_program__attach_fd(const struct bpf_program *prog, int target_fd, int btf_id link->detach = &bpf_link__detach_fd; attach_type = bpf_program__expected_attach_type(prog); - link_fd = bpf_link_create(prog_fd, target_fd, attach_type, &opts); + link_fd = bpf_link_create(prog_fd, target_fd, attach_type, opts); if (link_fd < 0) { link_fd = -errno; free(link); @@ -11807,19 +12424,88 @@ bpf_program__attach_fd(const struct bpf_program *prog, int target_fd, int btf_id struct bpf_link * bpf_program__attach_cgroup(const struct bpf_program *prog, int cgroup_fd) { - return bpf_program__attach_fd(prog, cgroup_fd, 0, "cgroup"); + return bpf_program_attach_fd(prog, cgroup_fd, "cgroup", NULL); } struct bpf_link * bpf_program__attach_netns(const struct bpf_program *prog, int netns_fd) { - return bpf_program__attach_fd(prog, netns_fd, 0, "netns"); + return bpf_program_attach_fd(prog, netns_fd, "netns", NULL); } struct bpf_link *bpf_program__attach_xdp(const struct bpf_program *prog, int ifindex) { /* target_fd/target_ifindex use the same field in LINK_CREATE */ - return bpf_program__attach_fd(prog, ifindex, 0, "xdp"); + return bpf_program_attach_fd(prog, ifindex, "xdp", NULL); +} + +struct bpf_link * +bpf_program__attach_tcx(const struct bpf_program *prog, int ifindex, + const struct bpf_tcx_opts *opts) +{ + LIBBPF_OPTS(bpf_link_create_opts, link_create_opts); + __u32 relative_id; + int relative_fd; + + if (!OPTS_VALID(opts, bpf_tcx_opts)) + return libbpf_err_ptr(-EINVAL); + + relative_id = OPTS_GET(opts, relative_id, 0); + relative_fd = OPTS_GET(opts, relative_fd, 0); + + /* validate we don't have unexpected combinations of non-zero fields */ + if (!ifindex) { + pr_warn("prog '%s': target netdevice ifindex cannot be zero\n", + prog->name); + return libbpf_err_ptr(-EINVAL); + } + if (relative_fd && relative_id) { + pr_warn("prog '%s': relative_fd and relative_id cannot be set at the same time\n", + prog->name); + return libbpf_err_ptr(-EINVAL); + } + + link_create_opts.tcx.expected_revision = OPTS_GET(opts, expected_revision, 0); + link_create_opts.tcx.relative_fd = relative_fd; + link_create_opts.tcx.relative_id = relative_id; + link_create_opts.flags = OPTS_GET(opts, flags, 0); + + /* target_fd/target_ifindex use the same field in LINK_CREATE */ + return bpf_program_attach_fd(prog, ifindex, "tcx", &link_create_opts); +} + +struct bpf_link * +bpf_program__attach_netkit(const struct bpf_program *prog, int ifindex, + const struct bpf_netkit_opts *opts) +{ + LIBBPF_OPTS(bpf_link_create_opts, link_create_opts); + __u32 relative_id; + int relative_fd; + + if (!OPTS_VALID(opts, bpf_netkit_opts)) + return libbpf_err_ptr(-EINVAL); + + relative_id = OPTS_GET(opts, relative_id, 0); + relative_fd = OPTS_GET(opts, relative_fd, 0); + + /* validate we don't have unexpected combinations of non-zero fields */ + if (!ifindex) { + pr_warn("prog '%s': target netdevice ifindex cannot be zero\n", + prog->name); + return libbpf_err_ptr(-EINVAL); + } + if (relative_fd && relative_id) { + pr_warn("prog '%s': relative_fd and relative_id cannot be set at the same time\n", + prog->name); + return libbpf_err_ptr(-EINVAL); + } + + link_create_opts.netkit.expected_revision = OPTS_GET(opts, expected_revision, 0); + link_create_opts.netkit.relative_fd = relative_fd; + link_create_opts.netkit.relative_id = relative_id; + link_create_opts.flags = OPTS_GET(opts, flags, 0); + + return bpf_program_attach_fd(prog, ifindex, "netkit", &link_create_opts); } struct bpf_link *bpf_program__attach_freplace(const struct bpf_program *prog, @@ -11841,11 +12527,16 @@ struct bpf_link *bpf_program__attach_freplace(const struct bpf_program *prog, } if (target_fd) { + LIBBPF_OPTS(bpf_link_create_opts, target_opts); + btf_id = libbpf_find_prog_btf_id(attach_func_name, target_fd); if (btf_id < 0) return libbpf_err_ptr(btf_id); - return bpf_program__attach_fd(prog, target_fd, btf_id, "freplace"); + target_opts.target_btf_id = btf_id; + + return bpf_program_attach_fd(prog, target_fd, "freplace", + &target_opts); } else { /* no target, so use raw_tracepoint_open for compatibility * with old kernels @@ -11900,6 +12591,48 @@ static int attach_iter(const struct bpf_program *prog, long cookie, struct bpf_l return libbpf_get_error(*link); } +struct bpf_link *bpf_program__attach_netfilter(const struct bpf_program *prog, + const struct bpf_netfilter_opts *opts) +{ + LIBBPF_OPTS(bpf_link_create_opts, lopts); + struct bpf_link *link; + int prog_fd, link_fd; + + if (!OPTS_VALID(opts, bpf_netfilter_opts)) + return libbpf_err_ptr(-EINVAL); + + prog_fd = bpf_program__fd(prog); + if (prog_fd < 0) { + pr_warn("prog '%s': can't attach before loaded\n", prog->name); + return libbpf_err_ptr(-EINVAL); + } + + link = calloc(1, sizeof(*link)); + if (!link) + return libbpf_err_ptr(-ENOMEM); + + link->detach = &bpf_link__detach_fd; + + lopts.netfilter.pf = OPTS_GET(opts, pf, 0); + lopts.netfilter.hooknum = OPTS_GET(opts, hooknum, 0); + lopts.netfilter.priority = OPTS_GET(opts, priority, 0); + lopts.netfilter.flags = OPTS_GET(opts, flags, 0); + + link_fd = bpf_link_create(prog_fd, 0, BPF_NETFILTER, &lopts); + if (link_fd < 0) { + char errmsg[STRERR_BUFSIZE]; + + link_fd = -errno; + free(link); + pr_warn("prog '%s': failed to attach to netfilter: %s\n", + prog->name, libbpf_strerror_r(link_fd, errmsg, sizeof(errmsg))); + return libbpf_err_ptr(link_fd); + } + link->fd = link_fd; + + return link; +} + struct bpf_link *bpf_program__attach(const struct bpf_program *prog) { struct bpf_link *link = NULL; @@ -11923,22 +12656,30 @@ struct bpf_link *bpf_program__attach(const struct bpf_program *prog) return link; } +struct bpf_link_struct_ops { + struct bpf_link link; + int map_fd; +}; + static int bpf_link__detach_struct_ops(struct bpf_link *link) { + struct bpf_link_struct_ops *st_link; __u32 zero = 0; - if (bpf_map_delete_elem(link->fd, &zero)) - return -errno; + st_link = container_of(link, struct bpf_link_struct_ops, link); - return 0; + if (st_link->map_fd < 0) + /* w/o a real link */ + return bpf_map_delete_elem(link->fd, &zero); + + return close(link->fd); } struct bpf_link *bpf_map__attach_struct_ops(const struct bpf_map *map) { - struct bpf_struct_ops *st_ops; - struct bpf_link *link; - __u32 i, zero = 0; - int err; + struct bpf_link_struct_ops *link; + __u32 zero = 0; + int err, fd; if (!bpf_map__is_struct_ops(map) || map->fd == -1) return libbpf_err_ptr(-EINVAL); @@ -11947,31 +12688,72 @@ struct bpf_link *bpf_map__attach_struct_ops(const struct bpf_map *map) if (!link) return libbpf_err_ptr(-EINVAL); - st_ops = map->st_ops; - for (i = 0; i < btf_vlen(st_ops->type); i++) { - struct bpf_program *prog = st_ops->progs[i]; - void *kern_data; - int prog_fd; - - if (!prog) - continue; - - prog_fd = bpf_program__fd(prog); - kern_data = st_ops->kern_vdata + st_ops->kern_func_off[i]; - *(unsigned long *)kern_data = prog_fd; - } - - err = bpf_map_update_elem(map->fd, &zero, st_ops->kern_vdata, 0); - if (err) { - err = -errno; + /* kern_vdata should be prepared during the loading phase. */ + err = bpf_map_update_elem(map->fd, &zero, map->st_ops->kern_vdata, 0); + /* It can be EBUSY if the map has been used to create or + * update a link before. We don't allow updating the value of + * a struct_ops once it is set. That ensures that the value + * never changed. So, it is safe to skip EBUSY. + */ + if (err && (!(map->def.map_flags & BPF_F_LINK) || err != -EBUSY)) { free(link); return libbpf_err_ptr(err); } - link->detach = bpf_link__detach_struct_ops; - link->fd = map->fd; + link->link.detach = bpf_link__detach_struct_ops; - return link; + if (!(map->def.map_flags & BPF_F_LINK)) { + /* w/o a real link */ + link->link.fd = map->fd; + link->map_fd = -1; + return &link->link; + } + + fd = bpf_link_create(map->fd, 0, BPF_STRUCT_OPS, NULL); + if (fd < 0) { + free(link); + return libbpf_err_ptr(fd); + } + + link->link.fd = fd; + link->map_fd = map->fd; + + return &link->link; +} + +/* + * Swap the back struct_ops of a link with a new struct_ops map. + */ +int bpf_link__update_map(struct bpf_link *link, const struct bpf_map *map) +{ + struct bpf_link_struct_ops *st_ops_link; + __u32 zero = 0; + int err; + + if (!bpf_map__is_struct_ops(map) || map->fd < 0) + return -EINVAL; + + st_ops_link = container_of(link, struct bpf_link_struct_ops, link); + /* Ensure the type of a link is correct */ + if (st_ops_link->map_fd < 0) + return -EINVAL; + + err = bpf_map_update_elem(map->fd, &zero, map->st_ops->kern_vdata, 0); + /* It can be EBUSY if the map has been used to create or + * update a link before. We don't allow updating the value of + * a struct_ops once it is set. That ensures that the value + * never changed. So, it is safe to skip EBUSY. + */ + if (err && err != -EBUSY) + return err; + + err = bpf_link_update(link->fd, map->fd, NULL); + if (err < 0) + return err; + + st_ops_link->map_fd = map->fd; + + return 0; } typedef enum bpf_perf_event_ret (*bpf_perf_event_print_t)(struct perf_event_header *hdr, @@ -12167,17 +12949,22 @@ struct perf_buffer *perf_buffer__new(int map_fd, size_t page_cnt, const size_t attr_sz = sizeof(struct perf_event_attr); struct perf_buffer_params p = {}; struct perf_event_attr attr; + __u32 sample_period; if (!OPTS_VALID(opts, perf_buffer_opts)) return libbpf_err_ptr(-EINVAL); + sample_period = OPTS_GET(opts, sample_period, 1); + if (!sample_period) + sample_period = 1; + memset(&attr, 0, attr_sz); attr.size = attr_sz; attr.config = PERF_COUNT_SW_BPF_OUTPUT; attr.type = PERF_TYPE_SOFTWARE; attr.sample_type = PERF_SAMPLE_RAW; - attr.sample_period = 1; - attr.wakeup_events = 1; + attr.sample_period = sample_period; + attr.wakeup_events = sample_period; p.attr = &attr; p.sample_cb = sample_cb; @@ -12230,7 +13017,7 @@ static struct perf_buffer *__perf_buffer__new(int map_fd, size_t page_cnt, /* best-effort sanity checks */ memset(&map, 0, sizeof(map)); map_info_len = sizeof(map); - err = bpf_obj_get_info_by_fd(map_fd, &map, &map_info_len); + err = bpf_map_get_info_by_fd(map_fd, &map, &map_info_len); if (err) { err = -errno; /* if BPF_OBJ_GET_INFO_BY_FD is supported, will return @@ -12846,7 +13633,7 @@ int bpf_object__load_skeleton(struct bpf_object_skeleton *s) for (i = 0; i < s->map_cnt; i++) { struct bpf_map *map = *s->maps[i].map; - size_t mmap_sz = bpf_map_mmap_sz(map); + size_t mmap_sz = bpf_map_mmap_sz(map->def.value_size, map->def.max_entries); int prot, map_fd = bpf_map__fd(map); void **mmaped = s->maps[i].mmaped; @@ -12873,8 +13660,7 @@ int bpf_object__load_skeleton(struct bpf_object_skeleton *s) * as per normal clean up procedure, so we don't need to worry * about it from skeleton's clean up perspective. */ - *mmaped = mmap(map->mmaped, mmap_sz, prot, - MAP_SHARED | MAP_FIXED, map_fd, 0); + *mmaped = mmap(map->mmaped, mmap_sz, prot, MAP_SHARED | MAP_FIXED, map_fd, 0); if (*mmaped == MAP_FAILED) { err = -errno; *mmaped = NULL; diff --git a/src/libbpf.h b/src/libbpf.h index eee883f..6cd9c50 100644 --- a/src/libbpf.h +++ b/src/libbpf.h @@ -96,6 +96,14 @@ enum libbpf_print_level { typedef int (*libbpf_print_fn_t)(enum libbpf_print_level level, const char *, va_list ap); +/** + * @brief **libbpf_set_print()** sets user-provided log callback function to + * be used for libbpf warnings and informational messages. + * @param fn The log print function. If NULL, libbpf won't print anything. + * @return Pointer to old print function. + * + * This function is thread-safe. + */ LIBBPF_API libbpf_print_fn_t libbpf_set_print(libbpf_print_fn_t fn); /* Hide internal to user */ @@ -174,6 +182,14 @@ struct bpf_object_open_opts { }; #define bpf_object_open_opts__last_field kernel_log_level +/** + * @brief **bpf_object__open()** creates a bpf_object by opening + * the BPF ELF object file pointed to by the passed path and loading it + * into memory. + * @param path BPF object file path. + * @return pointer to the new bpf_object; or NULL is returned on error, + * error code is stored in errno + */ LIBBPF_API struct bpf_object *bpf_object__open(const char *path); /** @@ -203,16 +219,46 @@ LIBBPF_API struct bpf_object * bpf_object__open_mem(const void *obj_buf, size_t obj_buf_sz, const struct bpf_object_open_opts *opts); -/* Load/unload object into/from kernel */ +/** + * @brief **bpf_object__load()** loads BPF object into kernel. + * @param obj Pointer to a valid BPF object instance returned by + * **bpf_object__open*()** APIs + * @return 0, on success; negative error code, otherwise, error code is + * stored in errno + */ LIBBPF_API int bpf_object__load(struct bpf_object *obj); -LIBBPF_API void bpf_object__close(struct bpf_object *object); +/** + * @brief **bpf_object__close()** closes a BPF object and releases all + * resources. + * @param obj Pointer to a valid BPF object + */ +LIBBPF_API void bpf_object__close(struct bpf_object *obj); -/* pin_maps and unpin_maps can both be called with a NULL path, in which case - * they will use the pin_path attribute of each map (and ignore all maps that - * don't have a pin_path set). +/** + * @brief **bpf_object__pin_maps()** pins each map contained within + * the BPF object at the passed directory. + * @param obj Pointer to a valid BPF object + * @param path A directory where maps should be pinned. + * @return 0, on success; negative error code, otherwise + * + * If `path` is NULL `bpf_map__pin` (which is being used on each map) + * will use the pin_path attribute of each map. In this case, maps that + * don't have a pin_path set will be ignored. */ LIBBPF_API int bpf_object__pin_maps(struct bpf_object *obj, const char *path); + +/** + * @brief **bpf_object__unpin_maps()** unpins each map contained within + * the BPF object found in the passed directory. + * @param obj Pointer to a valid BPF object + * @param path A directory where pinned maps should be searched for. + * @return 0, on success; negative error code, otherwise + * + * If `path` is NULL `bpf_map__unpin` (which is being used on each map) + * will use the pin_path attribute of each map. In this case, maps that + * don't have a pin_path set will be ignored. + */ LIBBPF_API int bpf_object__unpin_maps(struct bpf_object *obj, const char *path); LIBBPF_API int bpf_object__pin_programs(struct bpf_object *obj, @@ -220,6 +266,7 @@ LIBBPF_API int bpf_object__pin_programs(struct bpf_object *obj, LIBBPF_API int bpf_object__unpin_programs(struct bpf_object *obj, const char *path); LIBBPF_API int bpf_object__pin(struct bpf_object *object, const char *path); +LIBBPF_API int bpf_object__unpin(struct bpf_object *object, const char *path); LIBBPF_API const char *bpf_object__name(const struct bpf_object *obj); LIBBPF_API unsigned int bpf_object__kversion(const struct bpf_object *obj); @@ -403,12 +450,15 @@ LIBBPF_API struct bpf_link * bpf_program__attach(const struct bpf_program *prog); struct bpf_perf_event_opts { - /* size of this struct, for forward/backward compatiblity */ + /* size of this struct, for forward/backward compatibility */ size_t sz; /* custom user-provided value fetchable through bpf_get_attach_cookie() */ __u64 bpf_cookie; + /* don't use BPF link when attach BPF program */ + bool force_ioctl_attach; + size_t :0; }; -#define bpf_perf_event_opts__last_field bpf_cookie +#define bpf_perf_event_opts__last_field force_ioctl_attach LIBBPF_API struct bpf_link * bpf_program__attach_perf_event(const struct bpf_program *prog, int pfd); @@ -417,8 +467,25 @@ LIBBPF_API struct bpf_link * bpf_program__attach_perf_event_opts(const struct bpf_program *prog, int pfd, const struct bpf_perf_event_opts *opts); +/** + * enum probe_attach_mode - the mode to attach kprobe/uprobe + * + * force libbpf to attach kprobe/uprobe in specific mode, -ENOTSUP will + * be returned if it is not supported by the kernel. + */ +enum probe_attach_mode { + /* attach probe in latest supported mode by kernel */ + PROBE_ATTACH_MODE_DEFAULT = 0, + /* attach probe in legacy mode, using debugfs/tracefs */ + PROBE_ATTACH_MODE_LEGACY, + /* create perf event with perf_event_open() syscall */ + PROBE_ATTACH_MODE_PERF, + /* attach probe with BPF link */ + PROBE_ATTACH_MODE_LINK, +}; + struct bpf_kprobe_opts { - /* size of this struct, for forward/backward compatiblity */ + /* size of this struct, for forward/backward compatibility */ size_t sz; /* custom user-provided value fetchable through bpf_get_attach_cookie() */ __u64 bpf_cookie; @@ -426,9 +493,11 @@ struct bpf_kprobe_opts { size_t offset; /* kprobe is return probe */ bool retprobe; + /* kprobe attach mode */ + enum probe_attach_mode attach_mode; size_t :0; }; -#define bpf_kprobe_opts__last_field retprobe +#define bpf_kprobe_opts__last_field attach_mode LIBBPF_API struct bpf_link * bpf_program__attach_kprobe(const struct bpf_program *prog, bool retprobe, @@ -461,8 +530,59 @@ bpf_program__attach_kprobe_multi_opts(const struct bpf_program *prog, const char *pattern, const struct bpf_kprobe_multi_opts *opts); +struct bpf_uprobe_multi_opts { + /* size of this struct, for forward/backward compatibility */ + size_t sz; + /* array of function symbols to attach to */ + const char **syms; + /* array of function addresses to attach to */ + const unsigned long *offsets; + /* optional, array of associated ref counter offsets */ + const unsigned long *ref_ctr_offsets; + /* optional, array of associated BPF cookies */ + const __u64 *cookies; + /* number of elements in syms/addrs/cookies arrays */ + size_t cnt; + /* create return uprobes */ + bool retprobe; + size_t :0; +}; + +#define bpf_uprobe_multi_opts__last_field retprobe + +/** + * @brief **bpf_program__attach_uprobe_multi()** attaches a BPF program + * to multiple uprobes with uprobe_multi link. + * + * User can specify 2 mutually exclusive set of inputs: + * + * 1) use only path/func_pattern/pid arguments + * + * 2) use path/pid with allowed combinations of + * syms/offsets/ref_ctr_offsets/cookies/cnt + * + * - syms and offsets are mutually exclusive + * - ref_ctr_offsets and cookies are optional + * + * + * @param prog BPF program to attach + * @param pid Process ID to attach the uprobe to, 0 for self (own process), + * -1 for all processes + * @param binary_path Path to binary + * @param func_pattern Regular expression to specify functions to attach + * BPF program to + * @param opts Additional options (see **struct bpf_uprobe_multi_opts**) + * @return 0, on success; negative error code, otherwise + */ +LIBBPF_API struct bpf_link * +bpf_program__attach_uprobe_multi(const struct bpf_program *prog, + pid_t pid, + const char *binary_path, + const char *func_pattern, + const struct bpf_uprobe_multi_opts *opts); + struct bpf_ksyscall_opts { - /* size of this struct, for forward/backward compatiblity */ + /* size of this struct, for forward/backward compatibility */ size_t sz; /* custom user-provided value fetchable through bpf_get_attach_cookie() */ __u64 bpf_cookie; @@ -508,7 +628,7 @@ bpf_program__attach_ksyscall(const struct bpf_program *prog, const struct bpf_ksyscall_opts *opts); struct bpf_uprobe_opts { - /* size of this struct, for forward/backward compatiblity */ + /* size of this struct, for forward/backward compatibility */ size_t sz; /* offset of kernel reference counted USDT semaphore, added in * a6ca88b241d5 ("trace_uprobe: support reference counter in fd-based uprobe") @@ -526,9 +646,11 @@ struct bpf_uprobe_opts { * binary_path. */ const char *func_name; + /* uprobe attach mode */ + enum probe_attach_mode attach_mode; size_t :0; }; -#define bpf_uprobe_opts__last_field func_name +#define bpf_uprobe_opts__last_field attach_mode /** * @brief **bpf_program__attach_uprobe()** attaches a BPF program @@ -602,7 +724,7 @@ bpf_program__attach_usdt(const struct bpf_program *prog, const struct bpf_usdt_opts *opts); struct bpf_tracepoint_opts { - /* size of this struct, for forward/backward compatiblity */ + /* size of this struct, for forward/backward compatibility */ size_t sz; /* custom user-provided value fetchable through bpf_get_attach_cookie() */ __u64 bpf_cookie; @@ -648,9 +770,55 @@ LIBBPF_API struct bpf_link * bpf_program__attach_freplace(const struct bpf_program *prog, int target_fd, const char *attach_func_name); +struct bpf_netfilter_opts { + /* size of this struct, for forward/backward compatibility */ + size_t sz; + + __u32 pf; + __u32 hooknum; + __s32 priority; + __u32 flags; +}; +#define bpf_netfilter_opts__last_field flags + +LIBBPF_API struct bpf_link * +bpf_program__attach_netfilter(const struct bpf_program *prog, + const struct bpf_netfilter_opts *opts); + +struct bpf_tcx_opts { + /* size of this struct, for forward/backward compatibility */ + size_t sz; + __u32 flags; + __u32 relative_fd; + __u32 relative_id; + __u64 expected_revision; + size_t :0; +}; +#define bpf_tcx_opts__last_field expected_revision + +LIBBPF_API struct bpf_link * +bpf_program__attach_tcx(const struct bpf_program *prog, int ifindex, + const struct bpf_tcx_opts *opts); + +struct bpf_netkit_opts { + /* size of this struct, for forward/backward compatibility */ + size_t sz; + __u32 flags; + __u32 relative_fd; + __u32 relative_id; + __u64 expected_revision; + size_t :0; +}; +#define bpf_netkit_opts__last_field expected_revision + +LIBBPF_API struct bpf_link * +bpf_program__attach_netkit(const struct bpf_program *prog, int ifindex, + const struct bpf_netkit_opts *opts); + struct bpf_map; LIBBPF_API struct bpf_link *bpf_map__attach_struct_ops(const struct bpf_map *map); +LIBBPF_API int bpf_link__update_map(struct bpf_link *link, const struct bpf_map *map); struct bpf_iter_attach_opts { size_t sz; /* size of this struct for forward/backward compatibility */ @@ -798,8 +966,22 @@ LIBBPF_API int bpf_map__set_numa_node(struct bpf_map *map, __u32 numa_node); /* get/set map key size */ LIBBPF_API __u32 bpf_map__key_size(const struct bpf_map *map); LIBBPF_API int bpf_map__set_key_size(struct bpf_map *map, __u32 size); -/* get/set map value size */ +/* get map value size */ LIBBPF_API __u32 bpf_map__value_size(const struct bpf_map *map); +/** + * @brief **bpf_map__set_value_size()** sets map value size. + * @param map the BPF map instance + * @return 0, on success; negative error, otherwise + * + * There is a special case for maps with associated memory-mapped regions, like + * the global data section maps (bss, data, rodata). When this function is used + * on such a map, the mapped region is resized. Afterward, an attempt is made to + * adjust the corresponding BTF info. This attempt is best-effort and can only + * succeed if the last variable of the data section map is an array. The array + * BTF type is replaced by a new BTF array type with a different length. + * Any previously existing pointers returned from bpf_map__initial_value() or + * corresponding data section skeleton pointer must be reinitialized. + */ LIBBPF_API int bpf_map__set_value_size(struct bpf_map *map, __u32 size); /* get map key/value BTF type IDs */ LIBBPF_API __u32 bpf_map__btf_key_type_id(const struct bpf_map *map); @@ -813,7 +995,7 @@ LIBBPF_API int bpf_map__set_map_extra(struct bpf_map *map, __u64 map_extra); LIBBPF_API int bpf_map__set_initial_value(struct bpf_map *map, const void *data, size_t size); -LIBBPF_API const void *bpf_map__initial_value(struct bpf_map *map, size_t *psize); +LIBBPF_API void *bpf_map__initial_value(struct bpf_map *map, size_t *psize); /** * @brief **bpf_map__is_internal()** tells the caller whether or not the @@ -823,10 +1005,57 @@ LIBBPF_API const void *bpf_map__initial_value(struct bpf_map *map, size_t *psize * @return true, if the map is an internal map; false, otherwise */ LIBBPF_API bool bpf_map__is_internal(const struct bpf_map *map); + +/** + * @brief **bpf_map__set_pin_path()** sets the path attribute that tells where the + * BPF map should be pinned. This does not actually create the 'pin'. + * @param map The bpf_map + * @param path The path + * @return 0, on success; negative error, otherwise + */ LIBBPF_API int bpf_map__set_pin_path(struct bpf_map *map, const char *path); + +/** + * @brief **bpf_map__pin_path()** gets the path attribute that tells where the + * BPF map should be pinned. + * @param map The bpf_map + * @return The path string; which can be NULL + */ LIBBPF_API const char *bpf_map__pin_path(const struct bpf_map *map); + +/** + * @brief **bpf_map__is_pinned()** tells the caller whether or not the + * passed map has been pinned via a 'pin' file. + * @param map The bpf_map + * @return true, if the map is pinned; false, otherwise + */ LIBBPF_API bool bpf_map__is_pinned(const struct bpf_map *map); + +/** + * @brief **bpf_map__pin()** creates a file that serves as a 'pin' + * for the BPF map. This increments the reference count on the + * BPF map which will keep the BPF map loaded even after the + * userspace process which loaded it has exited. + * @param map The bpf_map to pin + * @param path A file path for the 'pin' + * @return 0, on success; negative error, otherwise + * + * If `path` is NULL the maps `pin_path` attribute will be used. If this is + * also NULL, an error will be returned and the map will not be pinned. + */ LIBBPF_API int bpf_map__pin(struct bpf_map *map, const char *path); + +/** + * @brief **bpf_map__unpin()** removes the file that serves as a + * 'pin' for the BPF map. + * @param map The bpf_map to unpin + * @param path A file path for the 'pin' + * @return 0, on success; negative error, otherwise + * + * The `path` parameter can be NULL, in which case the `pin_path` + * map attribute is unpinned. If both the `path` parameter and + * `pin_path` map attribute are set, they must be equal. + */ LIBBPF_API int bpf_map__unpin(struct bpf_map *map, const char *path); LIBBPF_API int bpf_map__set_inner_map_fd(struct bpf_map *map, int fd); @@ -957,9 +1186,11 @@ struct bpf_xdp_query_opts { __u32 hw_prog_id; /* output */ __u32 skb_prog_id; /* output */ __u8 attach_mode; /* output */ + __u64 feature_flags; /* output */ + __u32 xdp_zc_max_segs; /* output */ size_t :0; }; -#define bpf_xdp_query_opts__last_field attach_mode +#define bpf_xdp_query_opts__last_field xdp_zc_max_segs LIBBPF_API int bpf_xdp_attach(int ifindex, int prog_fd, __u32 flags, const struct bpf_xdp_attach_opts *opts); @@ -1013,12 +1244,13 @@ LIBBPF_API int bpf_tc_query(const struct bpf_tc_hook *hook, /* Ring buffer APIs */ struct ring_buffer; +struct ring; struct user_ring_buffer; typedef int (*ring_buffer_sample_fn)(void *ctx, void *data, size_t size); struct ring_buffer_opts { - size_t sz; /* size of this struct, for forward/backward compatiblity */ + size_t sz; /* size of this struct, for forward/backward compatibility */ }; #define ring_buffer_opts__last_field sz @@ -1033,13 +1265,86 @@ LIBBPF_API int ring_buffer__poll(struct ring_buffer *rb, int timeout_ms); LIBBPF_API int ring_buffer__consume(struct ring_buffer *rb); LIBBPF_API int ring_buffer__epoll_fd(const struct ring_buffer *rb); +/** + * @brief **ring_buffer__ring()** returns the ringbuffer object inside a given + * ringbuffer manager representing a single BPF_MAP_TYPE_RINGBUF map instance. + * + * @param rb A ringbuffer manager object. + * @param idx An index into the ringbuffers contained within the ringbuffer + * manager object. The index is 0-based and corresponds to the order in which + * ring_buffer__add was called. + * @return A ringbuffer object on success; NULL and errno set if the index is + * invalid. + */ +LIBBPF_API struct ring *ring_buffer__ring(struct ring_buffer *rb, + unsigned int idx); + +/** + * @brief **ring__consumer_pos()** returns the current consumer position in the + * given ringbuffer. + * + * @param r A ringbuffer object. + * @return The current consumer position. + */ +LIBBPF_API unsigned long ring__consumer_pos(const struct ring *r); + +/** + * @brief **ring__producer_pos()** returns the current producer position in the + * given ringbuffer. + * + * @param r A ringbuffer object. + * @return The current producer position. + */ +LIBBPF_API unsigned long ring__producer_pos(const struct ring *r); + +/** + * @brief **ring__avail_data_size()** returns the number of bytes in the + * ringbuffer not yet consumed. This has no locking associated with it, so it + * can be inaccurate if operations are ongoing while this is called. However, it + * should still show the correct trend over the long-term. + * + * @param r A ringbuffer object. + * @return The number of bytes not yet consumed. + */ +LIBBPF_API size_t ring__avail_data_size(const struct ring *r); + +/** + * @brief **ring__size()** returns the total size of the ringbuffer's map data + * area (excluding special producer/consumer pages). Effectively this gives the + * amount of usable bytes of data inside the ringbuffer. + * + * @param r A ringbuffer object. + * @return The total size of the ringbuffer map data area. + */ +LIBBPF_API size_t ring__size(const struct ring *r); + +/** + * @brief **ring__map_fd()** returns the file descriptor underlying the given + * ringbuffer. + * + * @param r A ringbuffer object. + * @return The underlying ringbuffer file descriptor + */ +LIBBPF_API int ring__map_fd(const struct ring *r); + +/** + * @brief **ring__consume()** consumes available ringbuffer data without event + * polling. + * + * @param r A ringbuffer object. + * @return The number of records consumed (or INT_MAX, whichever is less), or + * a negative number if any of the callbacks return an error. + */ +LIBBPF_API int ring__consume(struct ring *r); + struct user_ring_buffer_opts { size_t sz; /* size of this struct, for forward/backward compatibility */ }; #define user_ring_buffer_opts__last_field sz -/* @brief **user_ring_buffer__new()** creates a new instance of a user ring +/** + * @brief **user_ring_buffer__new()** creates a new instance of a user ring * buffer. * * @param map_fd A file descriptor to a BPF_MAP_TYPE_USER_RINGBUF map. @@ -1050,7 +1355,8 @@ struct user_ring_buffer_opts { LIBBPF_API struct user_ring_buffer * user_ring_buffer__new(int map_fd, const struct user_ring_buffer_opts *opts); -/* @brief **user_ring_buffer__reserve()** reserves a pointer to a sample in the +/** + * @brief **user_ring_buffer__reserve()** reserves a pointer to a sample in the * user ring buffer. * @param rb A pointer to a user ring buffer. * @param size The size of the sample, in bytes. @@ -1070,7 +1376,8 @@ user_ring_buffer__new(int map_fd, const struct user_ring_buffer_opts *opts); */ LIBBPF_API void *user_ring_buffer__reserve(struct user_ring_buffer *rb, __u32 size); -/* @brief **user_ring_buffer__reserve_blocking()** reserves a record in the +/** + * @brief **user_ring_buffer__reserve_blocking()** reserves a record in the * ring buffer, possibly blocking for up to @timeout_ms until a sample becomes * available. * @param rb The user ring buffer. @@ -1114,7 +1421,8 @@ LIBBPF_API void *user_ring_buffer__reserve_blocking(struct user_ring_buffer *rb, __u32 size, int timeout_ms); -/* @brief **user_ring_buffer__submit()** submits a previously reserved sample +/** + * @brief **user_ring_buffer__submit()** submits a previously reserved sample * into the ring buffer. * @param rb The user ring buffer. * @param sample A reserved sample. @@ -1124,7 +1432,8 @@ LIBBPF_API void *user_ring_buffer__reserve_blocking(struct user_ring_buffer *rb, */ LIBBPF_API void user_ring_buffer__submit(struct user_ring_buffer *rb, void *sample); -/* @brief **user_ring_buffer__discard()** discards a previously reserved sample. +/** + * @brief **user_ring_buffer__discard()** discards a previously reserved sample. * @param rb The user ring buffer. * @param sample A reserved sample. * @@ -1133,7 +1442,8 @@ LIBBPF_API void user_ring_buffer__submit(struct user_ring_buffer *rb, void *samp */ LIBBPF_API void user_ring_buffer__discard(struct user_ring_buffer *rb, void *sample); -/* @brief **user_ring_buffer__free()** frees a ring buffer that was previously +/** + * @brief **user_ring_buffer__free()** frees a ring buffer that was previously * created with **user_ring_buffer__new()**. * @param rb The user ring buffer being freed. */ @@ -1149,8 +1459,10 @@ typedef void (*perf_buffer_lost_fn)(void *ctx, int cpu, __u64 cnt); /* common use perf buffer options */ struct perf_buffer_opts { size_t sz; + __u32 sample_period; + size_t :0; }; -#define perf_buffer_opts__last_field sz +#define perf_buffer_opts__last_field sample_period /** * @brief **perf_buffer__new()** creates BPF perfbuf manager for a specified @@ -1375,7 +1687,7 @@ LIBBPF_API void bpf_object__destroy_subskeleton(struct bpf_object_subskeleton *s); struct gen_loader_opts { - size_t sz; /* size of this struct, for forward/backward compatiblity */ + size_t sz; /* size of this struct, for forward/backward compatibility */ const char *data; const char *insns; __u32 data_sz; @@ -1393,13 +1705,13 @@ enum libbpf_tristate { }; struct bpf_linker_opts { - /* size of this struct, for forward/backward compatiblity */ + /* size of this struct, for forward/backward compatibility */ size_t sz; }; #define bpf_linker_opts__last_field sz struct bpf_linker_file_opts { - /* size of this struct, for forward/backward compatiblity */ + /* size of this struct, for forward/backward compatibility */ size_t sz; }; #define bpf_linker_file_opts__last_field sz @@ -1442,7 +1754,7 @@ typedef int (*libbpf_prog_attach_fn_t)(const struct bpf_program *prog, long cook struct bpf_link **link); struct libbpf_prog_handler_opts { - /* size of this struct, for forward/backward compatiblity */ + /* size of this struct, for forward/backward compatibility */ size_t sz; /* User-provided value that is passed to prog_setup_fn, * prog_prepare_load_fn, and prog_attach_fn callbacks. Allows user to diff --git a/src/libbpf.map b/src/libbpf.map index 71bf569..b52dc28 100644 --- a/src/libbpf.map +++ b/src/libbpf.map @@ -382,3 +382,30 @@ LIBBPF_1.1.0 { user_ring_buffer__reserve_blocking; user_ring_buffer__submit; } LIBBPF_1.0.0; + +LIBBPF_1.2.0 { + global: + bpf_btf_get_info_by_fd; + bpf_link__update_map; + bpf_link_get_info_by_fd; + bpf_map_get_info_by_fd; + bpf_prog_get_info_by_fd; +} LIBBPF_1.1.0; + +LIBBPF_1.3.0 { + global: + bpf_obj_pin_opts; + bpf_object__unpin; + bpf_prog_detach_opts; + bpf_program__attach_netfilter; + bpf_program__attach_netkit; + bpf_program__attach_tcx; + bpf_program__attach_uprobe_multi; + ring__avail_data_size; + ring__consume; + ring__consumer_pos; + ring__map_fd; + ring__producer_pos; + ring__size; + ring_buffer__ring; +} LIBBPF_1.2.0; diff --git a/src/libbpf_common.h b/src/libbpf_common.h index 9a7937f..8fe248e 100644 --- a/src/libbpf_common.h +++ b/src/libbpf_common.h @@ -70,4 +70,23 @@ }; \ }) +/* Helper macro to clear and optionally reinitialize libbpf options struct + * + * Small helper macro to reset all fields and to reinitialize the common + * structure size member. Values provided by users in struct initializer- + * syntax as varargs can be provided as well to reinitialize options struct + * specific members. + */ +#define LIBBPF_OPTS_RESET(NAME, ...) \ + do { \ + typeof(NAME) ___##NAME = ({ \ + memset(&___##NAME, 0, sizeof(NAME)); \ + (typeof(NAME)) { \ + .sz = sizeof(NAME), \ + __VA_ARGS__ \ + }; \ + }); \ + memcpy(&NAME, &___##NAME, sizeof(NAME)); \ + } while (0) + #endif /* __LIBBPF_LIBBPF_COMMON_H */ diff --git a/src/libbpf_internal.h b/src/libbpf_internal.h index 377642f..8aaaf8f 100644 --- a/src/libbpf_internal.h +++ b/src/libbpf_internal.h @@ -15,6 +15,52 @@ #include #include #include + +#if defined HAVE_LIBELF +#include +#include +#endif // + +#ifdef HAVE_ELFIO +#include "elfio_c_wrapper.h" +#include +#include +#include +typedef struct Elf64_Ehdr Elf64_Ehdr; +typedef struct Elf64_Shdr Elf64_Shdr; +typedef struct Elf64_Sym Elf64_Sym; +typedef Elf64_Sym GElf_Sym; +typedef struct Elf64_Shdr GElf_Shdr; +typedef struct Elf64_Rel Elf64_Rel; +typedef Elf64_Half GElf_Versym ; +typedef struct { + void *d_buf; + size_t d_size; +} Elf_Data; + +typedef struct { + Elf64_Word vda_name; /* Version or dependency names */ + Elf64_Word vda_next; /* Offset in bytes to next verdaux entry */ +} GElf_Verdaux; + +typedef struct { + Elf64_Half vd_version; /* Version revision */ + Elf64_Half vd_flags; /* Version information */ + Elf64_Half vd_ndx; /* Version Index */ + Elf64_Half vd_cnt; /* Number of associated aux entries */ + Elf64_Word vd_hash; /* Version name hash value */ + Elf64_Word vd_aux; /* Offset in bytes to verdaux array */ + Elf64_Word vd_next; /* Offset in bytes to next verdef entry */ +} GElf_Verdef; + +#define ELF64_ST_TYPE(val) ELF_ST_TYPE (val) +#define ELF64_ST_BIND(val) ELF_ST_BIND (val) +#define GELF_ST_BIND(val) ELF64_ST_BIND (val) +#define elf_errmsg(val) "error" +#define SHT_GNU_versym 0x6fffffff +#define SHT_GNU_verdef 0x6ffffffd +#endif + #include "relo_core.h" /* make sure libbpf doesn't use kernel-only integer typedefs */ @@ -354,6 +400,8 @@ enum kern_feature_id { FEAT_BTF_ENUM64, /* Kernel uses syscall wrapper (CONFIG_ARCH_HAS_SYSCALL_WRAPPER) */ FEAT_SYSCALL_WRAPPER, + /* BPF multi-uprobe link support */ + FEAT_UPROBE_MULTI_LINK, __FEAT_CNT, }; @@ -469,6 +517,33 @@ struct bpf_line_info_min { __u32 line_col; }; +struct elf_sym { + const char *name; + GElf_Sym sym; + GElf_Shdr sh; + int ver; + bool hidden; +}; + +struct elf_sym_iter { +#ifdef HAVE_LIBELF + Elf *elf; +#elif HAVE_ELFIO + pelfio_t elf; + psection_t symsSec; +#endif + Elf_Data *syms; + Elf_Data *versyms; + Elf_Data *verdefs; + size_t nr_syms; + size_t strtabidx; + size_t verdef_strtabidx; + size_t next_sym_idx; + struct elf_sym sym; + int st_type; +}; + + typedef int (*type_id_visit_fn)(__u32 *type_id, void *ctx); typedef int (*str_off_visit_fn)(__u32 *str_off, void *ctx); @@ -543,6 +618,7 @@ static inline int ensure_good_fd(int fd) fd = fcntl(fd, F_DUPFD_CLOEXEC, 3); saved_errno = errno; close(old_fd); + errno = saved_errno; if (fd < 0) { pr_warn("failed to dup FD %d to FD > 2: %d\n", old_fd, -saved_errno); errno = saved_errno; @@ -576,4 +652,30 @@ static inline bool is_pow_of_2(size_t x) #define PROG_LOAD_ATTEMPTS 5 int sys_bpf_prog_load(union bpf_attr *attr, unsigned int size, int attempts); +bool glob_match(const char *str, const char *pat); + +#ifdef HAVE_LIBELF +long elf_find_func_offset(Elf *elf, const char *binary_path, const char *name); +#elif HAVE_ELFIO +long elf_find_func_offset(pelfio_t elf, const char *binary_path, const char *name); +#endif +long elf_find_func_offset_from_file(const char *binary_path, const char *name); + +struct elf_fd { +#ifdef HAVE_LIBELF + Elf *elf; +#elif HAVE_ELFIO + pelfio_t elf; +#endif + int fd; +}; + +int elf_open(const char *binary_path, struct elf_fd *elf_fd); +void elf_close(struct elf_fd *elf_fd); + +int elf_resolve_syms_offsets(const char *binary_path, int cnt, + const char **syms, unsigned long **poffsets); +int elf_resolve_pattern_offsets(const char *binary_path, const char *pattern, + unsigned long **poffsets, size_t *pcnt); + #endif /* __LIBBPF_LIBBPF_INTERNAL_H */ diff --git a/src/libbpf_probes.c b/src/libbpf_probes.c index b44fcbb..9c4db90 100644 --- a/src/libbpf_probes.c +++ b/src/libbpf_probes.c @@ -12,11 +12,94 @@ #include #include #include +#include #include "bpf.h" #include "libbpf.h" #include "libbpf_internal.h" +/* On Ubuntu LINUX_VERSION_CODE doesn't correspond to info.release, + * but Ubuntu provides /proc/version_signature file, as described at + * https://ubuntu.com/kernel, with an example contents below, which we + * can use to get a proper LINUX_VERSION_CODE. + * + * Ubuntu 5.4.0-12.15-generic 5.4.8 + * + * In the above, 5.4.8 is what kernel is actually expecting, while + * uname() call will return 5.4.0 in info.release. + */ +static __u32 get_ubuntu_kernel_version(void) +{ + const char *ubuntu_kver_file = "/proc/version_signature"; + __u32 major, minor, patch; + int ret; + FILE *f; + + if (faccessat(AT_FDCWD, ubuntu_kver_file, R_OK, AT_EACCESS) != 0) + return 0; + + f = fopen(ubuntu_kver_file, "re"); + if (!f) + return 0; + + ret = fscanf(f, "%*s %*s %u.%u.%u\n", &major, &minor, &patch); + fclose(f); + if (ret != 3) + return 0; + + return KERNEL_VERSION(major, minor, patch); +} + +/* On Debian LINUX_VERSION_CODE doesn't correspond to info.release. + * Instead, it is provided in info.version. An example content of + * Debian 10 looks like the below. + * + * utsname::release 4.19.0-22-amd64 + * utsname::version #1 SMP Debian 4.19.260-1 (2022-09-29) + * + * In the above, 4.19.260 is what kernel is actually expecting, while + * uname() call will return 4.19.0 in info.release. + */ +static __u32 get_debian_kernel_version(struct utsname *info) +{ + __u32 major, minor, patch; + char *p; + + p = strstr(info->version, "Debian "); + if (!p) { + /* This is not a Debian kernel. */ + return 0; + } + + if (sscanf(p, "Debian %u.%u.%u", &major, &minor, &patch) != 3) + return 0; + + return KERNEL_VERSION(major, minor, patch); +} + +__u32 get_kernel_version(void) +{ + __u32 major, minor, patch, version; + struct utsname info; + + /* Check if this is an Ubuntu kernel. */ + version = get_ubuntu_kernel_version(); + if (version != 0) + return version; + + uname(&info); + + /* Check if this is a Debian kernel. */ + version = get_debian_kernel_version(&info); + if (version != 0) + return version; + + if (sscanf(info.release, "%u.%u.%u", &major, &minor, &patch) != 3) + return 0; + + return KERNEL_VERSION(major, minor, patch); +} + static int probe_prog_load(enum bpf_prog_type prog_type, const struct bpf_insn *insns, size_t insns_cnt, char *log_buf, size_t log_buf_sz) @@ -98,6 +181,9 @@ static int probe_prog_load(enum bpf_prog_type prog_type, case BPF_PROG_TYPE_FLOW_DISSECTOR: case BPF_PROG_TYPE_CGROUP_SYSCTL: break; + case BPF_PROG_TYPE_NETFILTER: + opts.expected_attach_type = BPF_NETFILTER; + break; default: return -EOPNOTSUPP; } diff --git a/src/libbpf_version.h b/src/libbpf_version.h index e944f5b..290411d 100644 --- a/src/libbpf_version.h +++ b/src/libbpf_version.h @@ -4,6 +4,6 @@ #define __LIBBPF_VERSION_H #define LIBBPF_MAJOR_VERSION 1 -#define LIBBPF_MINOR_VERSION 1 +#define LIBBPF_MINOR_VERSION 3 #endif /* __LIBBPF_VERSION_H */ diff --git a/src/linker.c b/src/linker.c index 4ac02c2..5ced96d 100644 --- a/src/linker.c +++ b/src/linker.c @@ -1115,7 +1115,19 @@ static int extend_sec(struct bpf_linker *linker, struct dst_sec *dst, struct src if (src->shdr->sh_type != SHT_NOBITS) { tmp = realloc(dst->raw_data, dst_final_sz); - if (!tmp) + /* If dst_align_sz == 0, realloc() behaves in a special way: + * 1. When dst->raw_data is NULL it returns: + * "either NULL or a pointer suitable to be passed to free()" [1]. + * 2. When dst->raw_data is not-NULL it frees dst->raw_data and returns NULL, + * thus invalidating any "pointer suitable to be passed to free()" obtained + * at step (1). + * + * The dst_align_sz > 0 check avoids error exit after (2), otherwise + * dst->raw_data would be freed again in bpf_linker__free(). + * + * [1] man 3 realloc + */ + if (!tmp && dst_align_sz > 0) return -ENOMEM; dst->raw_data = tmp; @@ -1997,7 +2009,6 @@ add_sym: static int linker_append_elf_relos(struct bpf_linker *linker, struct src_obj *obj) { struct src_sec *src_symtab = &obj->secs[obj->symtab_sec_idx]; - struct dst_sec *dst_symtab; int i, err; for (i = 1; i < obj->sec_cnt; i++) { @@ -2030,9 +2041,6 @@ static int linker_append_elf_relos(struct bpf_linker *linker, struct src_obj *ob return -1; } - /* add_dst_sec() above could have invalidated linker->secs */ - dst_symtab = &linker->secs[linker->symtab_sec_idx]; - /* shdr->sh_link points to SYMTAB */ dst_sec->shdr->sh_link = linker->symtab_sec_idx; @@ -2049,16 +2057,13 @@ static int linker_append_elf_relos(struct bpf_linker *linker, struct src_obj *ob dst_rel = dst_sec->raw_data + src_sec->dst_off; n = src_sec->shdr->sh_size / src_sec->shdr->sh_entsize; for (j = 0; j < n; j++, src_rel++, dst_rel++) { - size_t src_sym_idx = ELF64_R_SYM(src_rel->r_info); - size_t sym_type = ELF64_R_TYPE(src_rel->r_info); - Elf64_Sym *src_sym, *dst_sym; - size_t dst_sym_idx; + size_t src_sym_idx, dst_sym_idx, sym_type; + Elf64_Sym *src_sym; src_sym_idx = ELF64_R_SYM(src_rel->r_info); src_sym = src_symtab->data->d_buf + sizeof(*src_sym) * src_sym_idx; dst_sym_idx = obj->sym_map[src_sym_idx]; - dst_sym = dst_symtab->raw_data + sizeof(*dst_sym) * dst_sym_idx; dst_rel->r_offset += src_linked_sec->dst_off; sym_type = ELF64_R_TYPE(src_rel->r_info); dst_rel->r_info = ELF64_R_INFO(dst_sym_idx, sym_type); diff --git a/src/netlink.c b/src/netlink.c index 3510458..090bcf6 100644 --- a/src/netlink.c +++ b/src/netlink.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -39,9 +40,16 @@ struct xdp_id_md { int ifindex; __u32 flags; struct xdp_link_info info; + __u64 feature_flags; }; -static int libbpf_netlink_open(__u32 *nl_pid) +struct xdp_features_md { + int ifindex; + __u32 xdp_zc_max_segs; + __u64 flags; +}; + +static int libbpf_netlink_open(__u32 *nl_pid, int proto) { struct sockaddr_nl sa; socklen_t addrlen; @@ -51,7 +59,7 @@ static int libbpf_netlink_open(__u32 *nl_pid) memset(&sa, 0, sizeof(sa)); sa.nl_family = AF_NETLINK; - sock = socket(AF_NETLINK, SOCK_RAW | SOCK_CLOEXEC, NETLINK_ROUTE); + sock = socket(AF_NETLINK, SOCK_RAW | SOCK_CLOEXEC, proto); if (sock < 0) return -errno; @@ -212,14 +220,14 @@ done: } static int libbpf_netlink_send_recv(struct libbpf_nla_req *req, - __dump_nlmsg_t parse_msg, + int proto, __dump_nlmsg_t parse_msg, libbpf_dump_nlmsg_t parse_attr, void *cookie) { __u32 nl_pid = 0; int sock, ret; - sock = libbpf_netlink_open(&nl_pid); + sock = libbpf_netlink_open(&nl_pid, proto); if (sock < 0) return sock; @@ -238,6 +246,43 @@ out: return ret; } +static int parse_genl_family_id(struct nlmsghdr *nh, libbpf_dump_nlmsg_t fn, + void *cookie) +{ + struct genlmsghdr *gnl = NLMSG_DATA(nh); + struct nlattr *na = (struct nlattr *)((void *)gnl + GENL_HDRLEN); + struct nlattr *tb[CTRL_ATTR_FAMILY_ID + 1]; + __u16 *id = cookie; + + libbpf_nla_parse(tb, CTRL_ATTR_FAMILY_ID, na, + NLMSG_PAYLOAD(nh, sizeof(*gnl)), NULL); + if (!tb[CTRL_ATTR_FAMILY_ID]) + return NL_CONT; + + *id = libbpf_nla_getattr_u16(tb[CTRL_ATTR_FAMILY_ID]); + return NL_DONE; +} + +static int libbpf_netlink_resolve_genl_family_id(const char *name, + __u16 len, __u16 *id) +{ + struct libbpf_nla_req req = { + .nh.nlmsg_len = NLMSG_LENGTH(GENL_HDRLEN), + .nh.nlmsg_type = GENL_ID_CTRL, + .nh.nlmsg_flags = NLM_F_REQUEST, + .gnl.cmd = CTRL_CMD_GETFAMILY, + .gnl.version = 2, + }; + int err; + + err = nlattr_add(&req, CTRL_ATTR_FAMILY_NAME, name, len); + if (err < 0) + return err; + + return libbpf_netlink_send_recv(&req, NETLINK_GENERIC, + parse_genl_family_id, NULL, id); +} + static int __bpf_set_link_xdp_fd_replace(int ifindex, int fd, int old_fd, __u32 flags) { @@ -271,7 +316,7 @@ static int __bpf_set_link_xdp_fd_replace(int ifindex, int fd, int old_fd, } nlattr_end_nested(&req, nla); - return libbpf_netlink_send_recv(&req, NULL, NULL, NULL); + return libbpf_netlink_send_recv(&req, NETLINK_ROUTE, NULL, NULL, NULL); } int bpf_xdp_attach(int ifindex, int prog_fd, __u32 flags, const struct bpf_xdp_attach_opts *opts) @@ -357,6 +402,32 @@ static int get_xdp_info(void *cookie, void *msg, struct nlattr **tb) return 0; } +static int parse_xdp_features(struct nlmsghdr *nh, libbpf_dump_nlmsg_t fn, + void *cookie) +{ + struct genlmsghdr *gnl = NLMSG_DATA(nh); + struct nlattr *na = (struct nlattr *)((void *)gnl + GENL_HDRLEN); + struct nlattr *tb[NETDEV_CMD_MAX + 1]; + struct xdp_features_md *md = cookie; + __u32 ifindex; + + libbpf_nla_parse(tb, NETDEV_CMD_MAX, na, + NLMSG_PAYLOAD(nh, sizeof(*gnl)), NULL); + + if (!tb[NETDEV_A_DEV_IFINDEX] || !tb[NETDEV_A_DEV_XDP_FEATURES]) + return NL_CONT; + + ifindex = libbpf_nla_getattr_u32(tb[NETDEV_A_DEV_IFINDEX]); + if (ifindex != md->ifindex) + return NL_CONT; + + md->flags = libbpf_nla_getattr_u64(tb[NETDEV_A_DEV_XDP_FEATURES]); + if (tb[NETDEV_A_DEV_XDP_ZC_MAX_SEGS]) + md->xdp_zc_max_segs = + libbpf_nla_getattr_u32(tb[NETDEV_A_DEV_XDP_ZC_MAX_SEGS]); + return NL_DONE; +} + int bpf_xdp_query(int ifindex, int xdp_flags, struct bpf_xdp_query_opts *opts) { struct libbpf_nla_req req = { @@ -366,6 +437,10 @@ int bpf_xdp_query(int ifindex, int xdp_flags, struct bpf_xdp_query_opts *opts) .ifinfo.ifi_family = AF_PACKET, }; struct xdp_id_md xdp_id = {}; + struct xdp_features_md md = { + .ifindex = ifindex, + }; + __u16 id; int err; if (!OPTS_VALID(opts, bpf_xdp_query_opts)) @@ -382,7 +457,7 @@ int bpf_xdp_query(int ifindex, int xdp_flags, struct bpf_xdp_query_opts *opts) xdp_id.ifindex = ifindex; xdp_id.flags = xdp_flags; - err = libbpf_netlink_send_recv(&req, __dump_link_nlmsg, + err = libbpf_netlink_send_recv(&req, NETLINK_ROUTE, __dump_link_nlmsg, get_xdp_info, &xdp_id); if (err) return libbpf_err(err); @@ -393,6 +468,38 @@ int bpf_xdp_query(int ifindex, int xdp_flags, struct bpf_xdp_query_opts *opts) OPTS_SET(opts, skb_prog_id, xdp_id.info.skb_prog_id); OPTS_SET(opts, attach_mode, xdp_id.info.attach_mode); + if (!OPTS_HAS(opts, feature_flags)) + return 0; + + err = libbpf_netlink_resolve_genl_family_id("netdev", sizeof("netdev"), &id); + if (err < 0) { + if (err == -ENOENT) { + opts->feature_flags = 0; + goto skip_feature_flags; + } + return libbpf_err(err); + } + + memset(&req, 0, sizeof(req)); + req.nh.nlmsg_len = NLMSG_LENGTH(GENL_HDRLEN); + req.nh.nlmsg_flags = NLM_F_REQUEST; + req.nh.nlmsg_type = id; + req.gnl.cmd = NETDEV_CMD_DEV_GET; + req.gnl.version = 2; + + err = nlattr_add(&req, NETDEV_A_DEV_IFINDEX, &ifindex, sizeof(ifindex)); + if (err < 0) + return libbpf_err(err); + + err = libbpf_netlink_send_recv(&req, NETLINK_GENERIC, + parse_xdp_features, NULL, &md); + if (err) + return libbpf_err(err); + + opts->feature_flags = md.flags; + opts->xdp_zc_max_segs = md.xdp_zc_max_segs; + +skip_feature_flags: return 0; } @@ -493,7 +600,7 @@ static int tc_qdisc_modify(struct bpf_tc_hook *hook, int cmd, int flags) if (ret < 0) return ret; - return libbpf_netlink_send_recv(&req, NULL, NULL, NULL); + return libbpf_netlink_send_recv(&req, NETLINK_ROUTE, NULL, NULL, NULL); } static int tc_qdisc_create_excl(struct bpf_tc_hook *hook) @@ -593,7 +700,7 @@ static int tc_add_fd_and_name(struct libbpf_nla_req *req, int fd) int len, ret; memset(&info, 0, info_len); - ret = bpf_obj_get_info_by_fd(fd, &info, &info_len); + ret = bpf_prog_get_info_by_fd(fd, &info, &info_len); if (ret < 0) return ret; @@ -673,7 +780,8 @@ int bpf_tc_attach(const struct bpf_tc_hook *hook, struct bpf_tc_opts *opts) info.opts = opts; - ret = libbpf_netlink_send_recv(&req, get_tc_info, NULL, &info); + ret = libbpf_netlink_send_recv(&req, NETLINK_ROUTE, get_tc_info, NULL, + &info); if (ret < 0) return libbpf_err(ret); if (!info.processed) @@ -739,7 +847,7 @@ static int __bpf_tc_detach(const struct bpf_tc_hook *hook, return ret; } - return libbpf_netlink_send_recv(&req, NULL, NULL, NULL); + return libbpf_netlink_send_recv(&req, NETLINK_ROUTE, NULL, NULL, NULL); } int bpf_tc_detach(const struct bpf_tc_hook *hook, @@ -804,7 +912,8 @@ int bpf_tc_query(const struct bpf_tc_hook *hook, struct bpf_tc_opts *opts) info.opts = opts; - ret = libbpf_netlink_send_recv(&req, get_tc_info, NULL, &info); + ret = libbpf_netlink_send_recv(&req, NETLINK_ROUTE, get_tc_info, NULL, + &info); if (ret < 0) return libbpf_err(ret); if (!info.processed) diff --git a/src/nlattr.c b/src/nlattr.c index 3900d05..975e265 100644 --- a/src/nlattr.c +++ b/src/nlattr.c @@ -178,7 +178,7 @@ int libbpf_nla_dump_errormsg(struct nlmsghdr *nlh) hlen += nlmsg_len(&err->msg); attr = (struct nlattr *) ((void *) err + hlen); - alen = nlh->nlmsg_len - hlen; + alen = (void *)nlh + nlh->nlmsg_len - (void *)attr; if (libbpf_nla_parse(tb, NLMSGERR_ATTR_MAX, attr, alen, extack_policy) != 0) { diff --git a/src/nlattr.h b/src/nlattr.h index 4d15ae2..d92d1c1 100644 --- a/src/nlattr.h +++ b/src/nlattr.h @@ -14,6 +14,7 @@ #include #include #include +#include /* avoid multiple definition of netlink features */ #define __LINUX_NETLINK_H @@ -58,6 +59,7 @@ struct libbpf_nla_req { union { struct ifinfomsg ifinfo; struct tcmsg tc; + struct genlmsghdr gnl; }; char buf[128]; }; @@ -89,11 +91,21 @@ static inline uint8_t libbpf_nla_getattr_u8(const struct nlattr *nla) return *(uint8_t *)libbpf_nla_data(nla); } +static inline uint16_t libbpf_nla_getattr_u16(const struct nlattr *nla) +{ + return *(uint16_t *)libbpf_nla_data(nla); +} + static inline uint32_t libbpf_nla_getattr_u32(const struct nlattr *nla) { return *(uint32_t *)libbpf_nla_data(nla); } +static inline uint64_t libbpf_nla_getattr_u64(const struct nlattr *nla) +{ + return *(uint64_t *)libbpf_nla_data(nla); +} + static inline const char *libbpf_nla_getattr_str(const struct nlattr *nla) { return (const char *)libbpf_nla_data(nla); diff --git a/src/relo_core.c b/src/relo_core.c index c4b0e81..63a4d5a 100644 --- a/src/relo_core.c +++ b/src/relo_core.c @@ -776,7 +776,7 @@ static int bpf_core_calc_field_relo(const char *prog_name, break; case BPF_CORE_FIELD_SIGNED: *val = (btf_is_any_enum(mt) && BTF_INFO_KFLAG(mt->info)) || - (btf_int_encoding(mt) & BTF_INT_SIGNED); + (btf_is_int(mt) && (btf_int_encoding(mt) & BTF_INT_SIGNED)); if (validate) *validate = true; /* signedness is never ambiguous */ break; @@ -1551,9 +1551,6 @@ int __bpf_core_types_match(const struct btf *local_btf, __u32 local_id, const st if (level <= 0) return -EINVAL; - local_t = btf_type_by_id(local_btf, local_id); - targ_t = btf_type_by_id(targ_btf, targ_id); - recur: depth--; if (depth < 0) diff --git a/src/ringbuf.c b/src/ringbuf.c index 47855af..aacb642 100644 --- a/src/ringbuf.c +++ b/src/ringbuf.c @@ -34,7 +34,7 @@ struct ring { struct ring_buffer { struct epoll_event *events; - struct ring *rings; + struct ring **rings; size_t page_size; int epoll_fd; int ring_cnt; @@ -57,7 +57,7 @@ struct ringbuf_hdr { __u32 pad; }; -static void ringbuf_unmap_ring(struct ring_buffer *rb, struct ring *r) +static void ringbuf_free_ring(struct ring_buffer *rb, struct ring *r) { if (r->consumer_pos) { munmap(r->consumer_pos, rb->page_size); @@ -67,6 +67,8 @@ static void ringbuf_unmap_ring(struct ring_buffer *rb, struct ring *r) munmap(r->producer_pos, rb->page_size + 2 * (r->mask + 1)); r->producer_pos = NULL; } + + free(r); } /* Add extra RINGBUF maps to this ring buffer manager */ @@ -83,7 +85,7 @@ int ring_buffer__add(struct ring_buffer *rb, int map_fd, memset(&info, 0, sizeof(info)); - err = bpf_obj_get_info_by_fd(map_fd, &info, &len); + err = bpf_map_get_info_by_fd(map_fd, &info, &len); if (err) { err = -errno; pr_warn("ringbuf: failed to get map info for fd=%d: %d\n", @@ -107,8 +109,10 @@ int ring_buffer__add(struct ring_buffer *rb, int map_fd, return libbpf_err(-ENOMEM); rb->events = tmp; - r = &rb->rings[rb->ring_cnt]; - memset(r, 0, sizeof(*r)); + r = calloc(1, sizeof(*r)); + if (!r) + return libbpf_err(-ENOMEM); + rb->rings[rb->ring_cnt] = r; r->map_fd = map_fd; r->sample_cb = sample_cb; @@ -121,7 +125,7 @@ int ring_buffer__add(struct ring_buffer *rb, int map_fd, err = -errno; pr_warn("ringbuf: failed to mmap consumer page for map fd=%d: %d\n", map_fd, err); - return libbpf_err(err); + goto err_out; } r->consumer_pos = tmp; @@ -131,16 +135,16 @@ int ring_buffer__add(struct ring_buffer *rb, int map_fd, */ mmap_sz = rb->page_size + 2 * (__u64)info.max_entries; if (mmap_sz != (__u64)(size_t)mmap_sz) { + err = -E2BIG; pr_warn("ringbuf: ring buffer size (%u) is too big\n", info.max_entries); - return libbpf_err(-E2BIG); + goto err_out; } tmp = mmap(NULL, (size_t)mmap_sz, PROT_READ, MAP_SHARED, map_fd, rb->page_size); if (tmp == MAP_FAILED) { err = -errno; - ringbuf_unmap_ring(rb, r); pr_warn("ringbuf: failed to mmap data pages for map fd=%d: %d\n", map_fd, err); - return libbpf_err(err); + goto err_out; } r->producer_pos = tmp; r->data = tmp + rb->page_size; @@ -152,14 +156,17 @@ int ring_buffer__add(struct ring_buffer *rb, int map_fd, e->data.fd = rb->ring_cnt; if (epoll_ctl(rb->epoll_fd, EPOLL_CTL_ADD, map_fd, e) < 0) { err = -errno; - ringbuf_unmap_ring(rb, r); pr_warn("ringbuf: failed to epoll add map fd=%d: %d\n", map_fd, err); - return libbpf_err(err); + goto err_out; } rb->ring_cnt++; return 0; + +err_out: + ringbuf_free_ring(rb, r); + return libbpf_err(err); } void ring_buffer__free(struct ring_buffer *rb) @@ -170,7 +177,7 @@ void ring_buffer__free(struct ring_buffer *rb) return; for (i = 0; i < rb->ring_cnt; ++i) - ringbuf_unmap_ring(rb, &rb->rings[i]); + ringbuf_free_ring(rb, rb->rings[i]); if (rb->epoll_fd >= 0) close(rb->epoll_fd); @@ -278,7 +285,7 @@ int ring_buffer__consume(struct ring_buffer *rb) int i; for (i = 0; i < rb->ring_cnt; i++) { - struct ring *ring = &rb->rings[i]; + struct ring *ring = rb->rings[i]; err = ringbuf_process_ring(ring); if (err < 0) @@ -305,7 +312,7 @@ int ring_buffer__poll(struct ring_buffer *rb, int timeout_ms) for (i = 0; i < cnt; i++) { __u32 ring_id = rb->events[i].data.fd; - struct ring *ring = &rb->rings[ring_id]; + struct ring *ring = rb->rings[ring_id]; err = ringbuf_process_ring(ring); if (err < 0) @@ -323,6 +330,58 @@ int ring_buffer__epoll_fd(const struct ring_buffer *rb) return rb->epoll_fd; } +struct ring *ring_buffer__ring(struct ring_buffer *rb, unsigned int idx) +{ + if (idx >= rb->ring_cnt) + return errno = ERANGE, NULL; + + return rb->rings[idx]; +} + +unsigned long ring__consumer_pos(const struct ring *r) +{ + /* Synchronizes with smp_store_release() in ringbuf_process_ring(). */ + return smp_load_acquire(r->consumer_pos); +} + +unsigned long ring__producer_pos(const struct ring *r) +{ + /* Synchronizes with smp_store_release() in __bpf_ringbuf_reserve() in + * the kernel. + */ + return smp_load_acquire(r->producer_pos); +} + +size_t ring__avail_data_size(const struct ring *r) +{ + unsigned long cons_pos, prod_pos; + + cons_pos = ring__consumer_pos(r); + prod_pos = ring__producer_pos(r); + return prod_pos - cons_pos; +} + +size_t ring__size(const struct ring *r) +{ + return r->mask + 1; +} + +int ring__map_fd(const struct ring *r) +{ + return r->map_fd; +} + +int ring__consume(struct ring *r) +{ + int64_t res; + + res = ringbuf_process_ring(r); + if (res < 0) + return libbpf_err(res); + + return res > INT_MAX ? INT_MAX : res; +} + static void user_ringbuf_unmap_ring(struct user_ring_buffer *rb) { if (rb->consumer_pos) { @@ -359,7 +418,7 @@ static int user_ringbuf_map(struct user_ring_buffer *rb, int map_fd) memset(&info, 0, sizeof(info)); - err = bpf_obj_get_info_by_fd(map_fd, &info, &len); + err = bpf_map_get_info_by_fd(map_fd, &info, &len); if (err) { err = -errno; pr_warn("user ringbuf: failed to get map info for fd=%d: %d\n", map_fd, err); diff --git a/src/usdt.bpf.h b/src/usdt.bpf.h index fdfd235..f676330 100644 --- a/src/usdt.bpf.h +++ b/src/usdt.bpf.h @@ -4,8 +4,8 @@ #define __USDT_BPF_H__ #include -#include -#include +#include "bpf_helpers.h" +#include "bpf_tracing.h" /* Below types and maps are internal implementation details of libbpf's USDT * support and are subjects to change. Also, bpf_usdt_xxx() API helpers should @@ -130,7 +130,10 @@ int bpf_usdt_arg(struct pt_regs *ctx, __u64 arg_num, long *res) if (!spec) return -ESRCH; - if (arg_num >= BPF_USDT_MAX_ARG_CNT || arg_num >= spec->arg_cnt) + if (arg_num >= BPF_USDT_MAX_ARG_CNT) + return -ENOENT; + barrier_var(arg_num); + if (arg_num >= spec->arg_cnt) return -ENOENT; arg_spec = &spec->args[arg_num]; diff --git a/src/usdt.c b/src/usdt.c index 75b411f..93794f0 100644 --- a/src/usdt.c +++ b/src/usdt.c @@ -250,6 +250,7 @@ struct usdt_manager { bool has_bpf_cookie; bool has_sema_refcnt; + bool has_uprobe_multi; }; struct usdt_manager *usdt_manager_new(struct bpf_object *obj) @@ -284,6 +285,11 @@ struct usdt_manager *usdt_manager_new(struct bpf_object *obj) */ man->has_sema_refcnt = faccessat(AT_FDCWD, ref_ctr_sysfs_path, F_OK, AT_EACCESS) == 0; + /* + * Detect kernel support for uprobe multi link to be used for attaching + * usdt probes. + */ + man->has_uprobe_multi = kernel_supports(obj, FEAT_UPROBE_MULTI_LINK); return man; } @@ -466,7 +472,7 @@ static int parse_vma_segs(int pid, const char *lib_path, struct elf_seg **segs, proceed: sprintf(line, "/proc/%d/maps", pid); - f = fopen(line, "r"); + f = fopen(line, "re"); if (!f) { err = -errno; pr_warn("usdt: failed to open '%s' to get base addr of '%s': %d\n", @@ -771,7 +777,7 @@ static int collect_usdt_targets(struct usdt_manager *man, Elf *elf, const char * target->rel_ip = usdt_rel_ip; target->sema_off = usdt_sema_off; - /* notes.args references strings from Elf itself, so they can + /* notes.args references strings from ELF itself, so they can * be referenced safely until elf_end() call */ target->spec_str = note.args; @@ -808,6 +814,8 @@ struct bpf_link_usdt { long abs_ip; struct bpf_link *link; } *uprobes; + + struct bpf_link *multi_link; }; static int bpf_link_usdt_detach(struct bpf_link *link) @@ -816,6 +824,9 @@ static int bpf_link_usdt_detach(struct bpf_link *link) struct usdt_manager *man = usdt_link->usdt_man; int i; + bpf_link__destroy(usdt_link->multi_link); + + /* When having multi_link, uprobe_cnt is 0 */ for (i = 0; i < usdt_link->uprobe_cnt; i++) { /* detach underlying uprobe link */ bpf_link__destroy(usdt_link->uprobes[i].link); @@ -852,8 +863,11 @@ static int bpf_link_usdt_detach(struct bpf_link *link) * system is so exhausted on memory, it's the least of user's * concerns, probably. * So just do our best here to return those IDs to usdt_manager. + * Another edge case when we can legitimately get NULL is when + * new_cnt is zero, which can happen in some edge cases, so we + * need to be careful about that. */ - if (new_free_ids) { + if (new_free_ids || new_cnt == 0) { memcpy(new_free_ids + man->free_spec_cnt, usdt_link->spec_ids, usdt_link->spec_cnt * sizeof(*usdt_link->spec_ids)); man->free_spec_ids = new_free_ids; @@ -943,33 +957,24 @@ struct bpf_link *usdt_manager_attach_usdt(struct usdt_manager *man, const struct const char *usdt_provider, const char *usdt_name, __u64 usdt_cookie) { - int i, fd, err, spec_map_fd, ip_map_fd; + unsigned long *offsets = NULL, *ref_ctr_offsets = NULL; + int i, err, spec_map_fd, ip_map_fd; LIBBPF_OPTS(bpf_uprobe_opts, opts); struct hashmap *specs_hash = NULL; struct bpf_link_usdt *link = NULL; struct usdt_target *targets = NULL; + __u64 *cookies = NULL; + struct elf_fd elf_fd; size_t target_cnt; - Elf *elf; spec_map_fd = bpf_map__fd(man->specs_map); ip_map_fd = bpf_map__fd(man->ip_to_spec_id_map); - /* TODO: perform path resolution similar to uprobe's */ - fd = open(path, O_RDONLY); - if (fd < 0) { - err = -errno; - pr_warn("usdt: failed to open ELF binary '%s': %d\n", path, err); + err = elf_open(path, &elf_fd); + if (err) return libbpf_err_ptr(err); - } - elf = elf_begin(fd, ELF_C_READ_MMAP, NULL); - if (!elf) { - err = -EBADF; - pr_warn("usdt: failed to parse ELF binary '%s': %s\n", path, elf_errmsg(-1)); - goto err_out; - } - - err = sanity_check_usdt_elf(elf, path); + err = sanity_check_usdt_elf(elf_fd.elf, path); if (err) goto err_out; @@ -982,7 +987,7 @@ struct bpf_link *usdt_manager_attach_usdt(struct usdt_manager *man, const struct /* discover USDT in given binary, optionally limiting * activations to a given PID, if pid > 0 */ - err = collect_usdt_targets(man, elf, path, pid, usdt_provider, usdt_name, + err = collect_usdt_targets(man, elf_fd.elf, path, pid, usdt_provider, usdt_name, usdt_cookie, &targets, &target_cnt); if (err <= 0) { err = (err == 0) ? -ENOENT : err; @@ -1005,10 +1010,21 @@ struct bpf_link *usdt_manager_attach_usdt(struct usdt_manager *man, const struct link->link.detach = &bpf_link_usdt_detach; link->link.dealloc = &bpf_link_usdt_dealloc; - link->uprobes = calloc(target_cnt, sizeof(*link->uprobes)); - if (!link->uprobes) { - err = -ENOMEM; - goto err_out; + if (man->has_uprobe_multi) { + offsets = calloc(target_cnt, sizeof(*offsets)); + cookies = calloc(target_cnt, sizeof(*cookies)); + ref_ctr_offsets = calloc(target_cnt, sizeof(*ref_ctr_offsets)); + + if (!offsets || !ref_ctr_offsets || !cookies) { + err = -ENOMEM; + goto err_out; + } + } else { + link->uprobes = calloc(target_cnt, sizeof(*link->uprobes)); + if (!link->uprobes) { + err = -ENOMEM; + goto err_out; + } } for (i = 0; i < target_cnt; i++) { @@ -1049,37 +1065,65 @@ struct bpf_link *usdt_manager_attach_usdt(struct usdt_manager *man, const struct goto err_out; } - opts.ref_ctr_offset = target->sema_off; - opts.bpf_cookie = man->has_bpf_cookie ? spec_id : 0; - uprobe_link = bpf_program__attach_uprobe_opts(prog, pid, path, - target->rel_ip, &opts); - err = libbpf_get_error(uprobe_link); - if (err) { - pr_warn("usdt: failed to attach uprobe #%d for '%s:%s' in '%s': %d\n", - i, usdt_provider, usdt_name, path, err); + if (man->has_uprobe_multi) { + offsets[i] = target->rel_ip; + ref_ctr_offsets[i] = target->sema_off; + cookies[i] = spec_id; + } else { + opts.ref_ctr_offset = target->sema_off; + opts.bpf_cookie = man->has_bpf_cookie ? spec_id : 0; + uprobe_link = bpf_program__attach_uprobe_opts(prog, pid, path, + target->rel_ip, &opts); + err = libbpf_get_error(uprobe_link); + if (err) { + pr_warn("usdt: failed to attach uprobe #%d for '%s:%s' in '%s': %d\n", + i, usdt_provider, usdt_name, path, err); + goto err_out; + } + + link->uprobes[i].link = uprobe_link; + link->uprobes[i].abs_ip = target->abs_ip; + link->uprobe_cnt++; + } + } + + if (man->has_uprobe_multi) { + LIBBPF_OPTS(bpf_uprobe_multi_opts, opts_multi, + .ref_ctr_offsets = ref_ctr_offsets, + .offsets = offsets, + .cookies = cookies, + .cnt = target_cnt, + ); + + link->multi_link = bpf_program__attach_uprobe_multi(prog, pid, path, + NULL, &opts_multi); + if (!link->multi_link) { + err = -errno; + pr_warn("usdt: failed to attach uprobe multi for '%s:%s' in '%s': %d\n", + usdt_provider, usdt_name, path, err); goto err_out; } - link->uprobes[i].link = uprobe_link; - link->uprobes[i].abs_ip = target->abs_ip; - link->uprobe_cnt++; + free(offsets); + free(ref_ctr_offsets); + free(cookies); } free(targets); hashmap__free(specs_hash); - elf_end(elf); - close(fd); - + elf_close(&elf_fd); return &link->link; err_out: + free(offsets); + free(ref_ctr_offsets); + free(cookies); + if (link) bpf_link__destroy(&link->link); free(targets); hashmap__free(specs_hash); - if (elf) - elf_end(elf); - close(fd); + elf_close(&elf_fd); return libbpf_err_ptr(err); } @@ -1141,12 +1185,13 @@ static int parse_usdt_note(Elf *elf, const char *path, GElf_Nhdr *nhdr, return 0; } -static int parse_usdt_arg(const char *arg_str, int arg_num, struct usdt_arg_spec *arg); +static int parse_usdt_arg(const char *arg_str, int arg_num, struct usdt_arg_spec *arg, int *arg_sz); static int parse_usdt_spec(struct usdt_spec *spec, const struct usdt_note *note, __u64 usdt_cookie) { + struct usdt_arg_spec *arg; const char *s; - int len; + int arg_sz, len; spec->usdt_cookie = usdt_cookie; spec->arg_cnt = 0; @@ -1159,10 +1204,25 @@ static int parse_usdt_spec(struct usdt_spec *spec, const struct usdt_note *note, return -E2BIG; } - len = parse_usdt_arg(s, spec->arg_cnt, &spec->args[spec->arg_cnt]); + arg = &spec->args[spec->arg_cnt]; + len = parse_usdt_arg(s, spec->arg_cnt, arg, &arg_sz); if (len < 0) return len; + arg->arg_signed = arg_sz < 0; + if (arg_sz < 0) + arg_sz = -arg_sz; + + switch (arg_sz) { + case 1: case 2: case 4: case 8: + arg->arg_bitshift = 64 - arg_sz * 8; + break; + default: + pr_warn("usdt: unsupported arg #%d (spec '%s') size: %d\n", + spec->arg_cnt, s, arg_sz); + return -EINVAL; + } + s += len; spec->arg_cnt++; } @@ -1219,13 +1279,13 @@ static int calc_pt_regs_off(const char *reg_name) return -ENOENT; } -static int parse_usdt_arg(const char *arg_str, int arg_num, struct usdt_arg_spec *arg) +static int parse_usdt_arg(const char *arg_str, int arg_num, struct usdt_arg_spec *arg, int *arg_sz) { char reg_name[16]; - int arg_sz, len, reg_off; + int len, reg_off; long off; - if (sscanf(arg_str, " %d @ %ld ( %%%15[^)] ) %n", &arg_sz, &off, reg_name, &len) == 3) { + if (sscanf(arg_str, " %d @ %ld ( %%%15[^)] ) %n", arg_sz, &off, reg_name, &len) == 3) { /* Memory dereference case, e.g., -4@-20(%rbp) */ arg->arg_type = USDT_ARG_REG_DEREF; arg->val_off = off; @@ -1233,7 +1293,7 @@ static int parse_usdt_arg(const char *arg_str, int arg_num, struct usdt_arg_spec if (reg_off < 0) return reg_off; arg->reg_off = reg_off; - } else if (sscanf(arg_str, " %d @ ( %%%15[^)] ) %n", &arg_sz, reg_name, &len) == 2) { + } else if (sscanf(arg_str, " %d @ ( %%%15[^)] ) %n", arg_sz, reg_name, &len) == 2) { /* Memory dereference case without offset, e.g., 8@(%rsp) */ arg->arg_type = USDT_ARG_REG_DEREF; arg->val_off = 0; @@ -1241,7 +1301,7 @@ static int parse_usdt_arg(const char *arg_str, int arg_num, struct usdt_arg_spec if (reg_off < 0) return reg_off; arg->reg_off = reg_off; - } else if (sscanf(arg_str, " %d @ %%%15s %n", &arg_sz, reg_name, &len) == 2) { + } else if (sscanf(arg_str, " %d @ %%%15s %n", arg_sz, reg_name, &len) == 2) { /* Register read case, e.g., -4@%eax */ arg->arg_type = USDT_ARG_REG; arg->val_off = 0; @@ -1250,7 +1310,7 @@ static int parse_usdt_arg(const char *arg_str, int arg_num, struct usdt_arg_spec if (reg_off < 0) return reg_off; arg->reg_off = reg_off; - } else if (sscanf(arg_str, " %d @ $%ld %n", &arg_sz, &off, &len) == 2) { + } else if (sscanf(arg_str, " %d @ $%ld %n", arg_sz, &off, &len) == 2) { /* Constant value case, e.g., 4@$71 */ arg->arg_type = USDT_ARG_CONST; arg->val_off = off; @@ -1260,20 +1320,6 @@ static int parse_usdt_arg(const char *arg_str, int arg_num, struct usdt_arg_spec return -EINVAL; } - arg->arg_signed = arg_sz < 0; - if (arg_sz < 0) - arg_sz = -arg_sz; - - switch (arg_sz) { - case 1: case 2: case 4: case 8: - arg->arg_bitshift = 64 - arg_sz * 8; - break; - default: - pr_warn("usdt: unsupported arg #%d (spec '%s') size: %d\n", - arg_num, arg_str, arg_sz); - return -EINVAL; - } - return len; } @@ -1281,13 +1327,13 @@ static int parse_usdt_arg(const char *arg_str, int arg_num, struct usdt_arg_spec /* Do not support __s390__ for now, since user_pt_regs is broken with -m31. */ -static int parse_usdt_arg(const char *arg_str, int arg_num, struct usdt_arg_spec *arg) +static int parse_usdt_arg(const char *arg_str, int arg_num, struct usdt_arg_spec *arg, int *arg_sz) { unsigned int reg; - int arg_sz, len; + int len; long off; - if (sscanf(arg_str, " %d @ %ld ( %%r%u ) %n", &arg_sz, &off, ®, &len) == 3) { + if (sscanf(arg_str, " %d @ %ld ( %%r%u ) %n", arg_sz, &off, ®, &len) == 3) { /* Memory dereference case, e.g., -2@-28(%r15) */ arg->arg_type = USDT_ARG_REG_DEREF; arg->val_off = off; @@ -1296,7 +1342,7 @@ static int parse_usdt_arg(const char *arg_str, int arg_num, struct usdt_arg_spec return -EINVAL; } arg->reg_off = offsetof(user_pt_regs, gprs[reg]); - } else if (sscanf(arg_str, " %d @ %%r%u %n", &arg_sz, ®, &len) == 2) { + } else if (sscanf(arg_str, " %d @ %%r%u %n", arg_sz, ®, &len) == 2) { /* Register read case, e.g., -8@%r0 */ arg->arg_type = USDT_ARG_REG; arg->val_off = 0; @@ -1305,7 +1351,7 @@ static int parse_usdt_arg(const char *arg_str, int arg_num, struct usdt_arg_spec return -EINVAL; } arg->reg_off = offsetof(user_pt_regs, gprs[reg]); - } else if (sscanf(arg_str, " %d @ %ld %n", &arg_sz, &off, &len) == 2) { + } else if (sscanf(arg_str, " %d @ %ld %n", arg_sz, &off, &len) == 2) { /* Constant value case, e.g., 4@71 */ arg->arg_type = USDT_ARG_CONST; arg->val_off = off; @@ -1315,20 +1361,6 @@ static int parse_usdt_arg(const char *arg_str, int arg_num, struct usdt_arg_spec return -EINVAL; } - arg->arg_signed = arg_sz < 0; - if (arg_sz < 0) - arg_sz = -arg_sz; - - switch (arg_sz) { - case 1: case 2: case 4: case 8: - arg->arg_bitshift = 64 - arg_sz * 8; - break; - default: - pr_warn("usdt: unsupported arg #%d (spec '%s') size: %d\n", - arg_num, arg_str, arg_sz); - return -EINVAL; - } - return len; } @@ -1348,13 +1380,13 @@ static int calc_pt_regs_off(const char *reg_name) return -ENOENT; } -static int parse_usdt_arg(const char *arg_str, int arg_num, struct usdt_arg_spec *arg) +static int parse_usdt_arg(const char *arg_str, int arg_num, struct usdt_arg_spec *arg, int *arg_sz) { char reg_name[16]; - int arg_sz, len, reg_off; + int len, reg_off; long off; - if (sscanf(arg_str, " %d @ \[ %15[a-z0-9], %ld ] %n", &arg_sz, reg_name, &off, &len) == 3) { + if (sscanf(arg_str, " %d @ \[ %15[a-z0-9] , %ld ] %n", arg_sz, reg_name, &off, &len) == 3) { /* Memory dereference case, e.g., -4@[sp, 96] */ arg->arg_type = USDT_ARG_REG_DEREF; arg->val_off = off; @@ -1362,7 +1394,7 @@ static int parse_usdt_arg(const char *arg_str, int arg_num, struct usdt_arg_spec if (reg_off < 0) return reg_off; arg->reg_off = reg_off; - } else if (sscanf(arg_str, " %d @ \[ %15[a-z0-9] ] %n", &arg_sz, reg_name, &len) == 2) { + } else if (sscanf(arg_str, " %d @ \[ %15[a-z0-9] ] %n", arg_sz, reg_name, &len) == 2) { /* Memory dereference case, e.g., -4@[sp] */ arg->arg_type = USDT_ARG_REG_DEREF; arg->val_off = 0; @@ -1370,12 +1402,12 @@ static int parse_usdt_arg(const char *arg_str, int arg_num, struct usdt_arg_spec if (reg_off < 0) return reg_off; arg->reg_off = reg_off; - } else if (sscanf(arg_str, " %d @ %ld %n", &arg_sz, &off, &len) == 2) { + } else if (sscanf(arg_str, " %d @ %ld %n", arg_sz, &off, &len) == 2) { /* Constant value case, e.g., 4@5 */ arg->arg_type = USDT_ARG_CONST; arg->val_off = off; arg->reg_off = 0; - } else if (sscanf(arg_str, " %d @ %15[a-z0-9] %n", &arg_sz, reg_name, &len) == 2) { + } else if (sscanf(arg_str, " %d @ %15[a-z0-9] %n", arg_sz, reg_name, &len) == 2) { /* Register read case, e.g., -8@x4 */ arg->arg_type = USDT_ARG_REG; arg->val_off = 0; @@ -1388,20 +1420,6 @@ static int parse_usdt_arg(const char *arg_str, int arg_num, struct usdt_arg_spec return -EINVAL; } - arg->arg_signed = arg_sz < 0; - if (arg_sz < 0) - arg_sz = -arg_sz; - - switch (arg_sz) { - case 1: case 2: case 4: case 8: - arg->arg_bitshift = 64 - arg_sz * 8; - break; - default: - pr_warn("usdt: unsupported arg #%d (spec '%s') size: %d\n", - arg_num, arg_str, arg_sz); - return -EINVAL; - } - return len; } @@ -1456,13 +1474,13 @@ static int calc_pt_regs_off(const char *reg_name) return -ENOENT; } -static int parse_usdt_arg(const char *arg_str, int arg_num, struct usdt_arg_spec *arg) +static int parse_usdt_arg(const char *arg_str, int arg_num, struct usdt_arg_spec *arg, int *arg_sz) { char reg_name[16]; - int arg_sz, len, reg_off; + int len, reg_off; long off; - if (sscanf(arg_str, " %d @ %ld ( %15[a-z0-9] ) %n", &arg_sz, &off, reg_name, &len) == 3) { + if (sscanf(arg_str, " %d @ %ld ( %15[a-z0-9] ) %n", arg_sz, &off, reg_name, &len) == 3) { /* Memory dereference case, e.g., -8@-88(s0) */ arg->arg_type = USDT_ARG_REG_DEREF; arg->val_off = off; @@ -1470,12 +1488,12 @@ static int parse_usdt_arg(const char *arg_str, int arg_num, struct usdt_arg_spec if (reg_off < 0) return reg_off; arg->reg_off = reg_off; - } else if (sscanf(arg_str, " %d @ %ld %n", &arg_sz, &off, &len) == 2) { + } else if (sscanf(arg_str, " %d @ %ld %n", arg_sz, &off, &len) == 2) { /* Constant value case, e.g., 4@5 */ arg->arg_type = USDT_ARG_CONST; arg->val_off = off; arg->reg_off = 0; - } else if (sscanf(arg_str, " %d @ %15[a-z0-9] %n", &arg_sz, reg_name, &len) == 2) { + } else if (sscanf(arg_str, " %d @ %15[a-z0-9] %n", arg_sz, reg_name, &len) == 2) { /* Register read case, e.g., -8@a1 */ arg->arg_type = USDT_ARG_REG; arg->val_off = 0; @@ -1488,17 +1506,83 @@ static int parse_usdt_arg(const char *arg_str, int arg_num, struct usdt_arg_spec return -EINVAL; } - arg->arg_signed = arg_sz < 0; - if (arg_sz < 0) - arg_sz = -arg_sz; + return len; +} - switch (arg_sz) { - case 1: case 2: case 4: case 8: - arg->arg_bitshift = 64 - arg_sz * 8; - break; - default: - pr_warn("usdt: unsupported arg #%d (spec '%s') size: %d\n", - arg_num, arg_str, arg_sz); +#elif defined(__arm__) + +static int calc_pt_regs_off(const char *reg_name) +{ + static struct { + const char *name; + size_t pt_regs_off; + } reg_map[] = { + { "r0", offsetof(struct pt_regs, uregs[0]) }, + { "r1", offsetof(struct pt_regs, uregs[1]) }, + { "r2", offsetof(struct pt_regs, uregs[2]) }, + { "r3", offsetof(struct pt_regs, uregs[3]) }, + { "r4", offsetof(struct pt_regs, uregs[4]) }, + { "r5", offsetof(struct pt_regs, uregs[5]) }, + { "r6", offsetof(struct pt_regs, uregs[6]) }, + { "r7", offsetof(struct pt_regs, uregs[7]) }, + { "r8", offsetof(struct pt_regs, uregs[8]) }, + { "r9", offsetof(struct pt_regs, uregs[9]) }, + { "r10", offsetof(struct pt_regs, uregs[10]) }, + { "fp", offsetof(struct pt_regs, uregs[11]) }, + { "ip", offsetof(struct pt_regs, uregs[12]) }, + { "sp", offsetof(struct pt_regs, uregs[13]) }, + { "lr", offsetof(struct pt_regs, uregs[14]) }, + { "pc", offsetof(struct pt_regs, uregs[15]) }, + }; + int i; + + for (i = 0; i < ARRAY_SIZE(reg_map); i++) { + if (strcmp(reg_name, reg_map[i].name) == 0) + return reg_map[i].pt_regs_off; + } + + pr_warn("usdt: unrecognized register '%s'\n", reg_name); + return -ENOENT; +} + +static int parse_usdt_arg(const char *arg_str, int arg_num, struct usdt_arg_spec *arg, int *arg_sz) +{ + char reg_name[16]; + int len, reg_off; + long off; + + if (sscanf(arg_str, " %d @ \[ %15[a-z0-9] , #%ld ] %n", + arg_sz, reg_name, &off, &len) == 3) { + /* Memory dereference case, e.g., -4@[fp, #96] */ + arg->arg_type = USDT_ARG_REG_DEREF; + arg->val_off = off; + reg_off = calc_pt_regs_off(reg_name); + if (reg_off < 0) + return reg_off; + arg->reg_off = reg_off; + } else if (sscanf(arg_str, " %d @ \[ %15[a-z0-9] ] %n", arg_sz, reg_name, &len) == 2) { + /* Memory dereference case, e.g., -4@[sp] */ + arg->arg_type = USDT_ARG_REG_DEREF; + arg->val_off = 0; + reg_off = calc_pt_regs_off(reg_name); + if (reg_off < 0) + return reg_off; + arg->reg_off = reg_off; + } else if (sscanf(arg_str, " %d @ #%ld %n", arg_sz, &off, &len) == 2) { + /* Constant value case, e.g., 4@#5 */ + arg->arg_type = USDT_ARG_CONST; + arg->val_off = off; + arg->reg_off = 0; + } else if (sscanf(arg_str, " %d @ %15[a-z0-9] %n", arg_sz, reg_name, &len) == 2) { + /* Register read case, e.g., -8@r4 */ + arg->arg_type = USDT_ARG_REG; + arg->val_off = 0; + reg_off = calc_pt_regs_off(reg_name); + if (reg_off < 0) + return reg_off; + arg->reg_off = reg_off; + } else { + pr_warn("usdt: unrecognized arg #%d spec '%s'\n", arg_num, arg_str); return -EINVAL; } @@ -1507,7 +1591,7 @@ static int parse_usdt_arg(const char *arg_str, int arg_num, struct usdt_arg_spec #else -static int parse_usdt_arg(const char *arg_str, int arg_num, struct usdt_arg_spec *arg) +static int parse_usdt_arg(const char *arg_str, int arg_num, struct usdt_arg_spec *arg, int *arg_sz) { pr_warn("usdt: libbpf doesn't support USDTs on current architecture\n"); return -ENOTSUP; diff --git a/src/zip.c b/src/zip.c new file mode 100644 index 0000000..3f26d62 --- /dev/null +++ b/src/zip.c @@ -0,0 +1,333 @@ +// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) +/* + * Routines for dealing with .zip archives. + * + * Copyright (c) Meta Platforms, Inc. and affiliates. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "libbpf_internal.h" +#include "zip.h" + +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wpacked" +#pragma GCC diagnostic ignored "-Wattributes" + +/* Specification of ZIP file format can be found here: + * https://pkware.cachefly.net/webdocs/casestudies/APPNOTE.TXT + * For a high level overview of the structure of a ZIP file see + * sections 4.3.1 - 4.3.6. + * + * Data structures appearing in ZIP files do not contain any + * padding and they might be misaligned. To allow us to safely + * operate on pointers to such structures and their members, we + * declare the types as packed. + */ + +#define END_OF_CD_RECORD_MAGIC 0x06054b50 + +/* See section 4.3.16 of the spec. */ +struct end_of_cd_record { + /* Magic value equal to END_OF_CD_RECORD_MAGIC */ + __u32 magic; + + /* Number of the file containing this structure or 0xFFFF if ZIP64 archive. + * Zip archive might span multiple files (disks). + */ + __u16 this_disk; + + /* Number of the file containing the beginning of the central directory or + * 0xFFFF if ZIP64 archive. + */ + __u16 cd_disk; + + /* Number of central directory records on this disk or 0xFFFF if ZIP64 + * archive. + */ + __u16 cd_records; + + /* Number of central directory records on all disks or 0xFFFF if ZIP64 + * archive. + */ + __u16 cd_records_total; + + /* Size of the central directory record or 0xFFFFFFFF if ZIP64 archive. */ + __u32 cd_size; + + /* Offset of the central directory from the beginning of the archive or + * 0xFFFFFFFF if ZIP64 archive. + */ + __u32 cd_offset; + + /* Length of comment data following end of central directory record. */ + __u16 comment_length; + + /* Up to 64k of arbitrary bytes. */ + /* uint8_t comment[comment_length] */ +} __attribute__((packed)); + +#define CD_FILE_HEADER_MAGIC 0x02014b50 +#define FLAG_ENCRYPTED (1 << 0) +#define FLAG_HAS_DATA_DESCRIPTOR (1 << 3) + +/* See section 4.3.12 of the spec. */ +struct cd_file_header { + /* Magic value equal to CD_FILE_HEADER_MAGIC. */ + __u32 magic; + __u16 version; + /* Minimum zip version needed to extract the file. */ + __u16 min_version; + __u16 flags; + __u16 compression; + __u16 last_modified_time; + __u16 last_modified_date; + __u32 crc; + __u32 compressed_size; + __u32 uncompressed_size; + __u16 file_name_length; + __u16 extra_field_length; + __u16 file_comment_length; + /* Number of the disk where the file starts or 0xFFFF if ZIP64 archive. */ + __u16 disk; + __u16 internal_attributes; + __u32 external_attributes; + /* Offset from the start of the disk containing the local file header to the + * start of the local file header. + */ + __u32 offset; +} __attribute__((packed)); + +#define LOCAL_FILE_HEADER_MAGIC 0x04034b50 + +/* See section 4.3.7 of the spec. */ +struct local_file_header { + /* Magic value equal to LOCAL_FILE_HEADER_MAGIC. */ + __u32 magic; + /* Minimum zip version needed to extract the file. */ + __u16 min_version; + __u16 flags; + __u16 compression; + __u16 last_modified_time; + __u16 last_modified_date; + __u32 crc; + __u32 compressed_size; + __u32 uncompressed_size; + __u16 file_name_length; + __u16 extra_field_length; +} __attribute__((packed)); + +#pragma GCC diagnostic pop + +struct zip_archive { + void *data; + __u32 size; + __u32 cd_offset; + __u32 cd_records; +}; + +static void *check_access(struct zip_archive *archive, __u32 offset, __u32 size) +{ + if (offset + size > archive->size || offset > offset + size) + return NULL; + + return archive->data + offset; +} + +/* Returns 0 on success, -EINVAL on error and -ENOTSUP if the eocd indicates the + * archive uses features which are not supported. + */ +static int try_parse_end_of_cd(struct zip_archive *archive, __u32 offset) +{ + __u16 comment_length, cd_records; + struct end_of_cd_record *eocd; + __u32 cd_offset, cd_size; + + eocd = check_access(archive, offset, sizeof(*eocd)); + if (!eocd || eocd->magic != END_OF_CD_RECORD_MAGIC) + return -EINVAL; + + comment_length = eocd->comment_length; + if (offset + sizeof(*eocd) + comment_length != archive->size) + return -EINVAL; + + cd_records = eocd->cd_records; + if (eocd->this_disk != 0 || eocd->cd_disk != 0 || eocd->cd_records_total != cd_records) + /* This is a valid eocd, but we only support single-file non-ZIP64 archives. */ + return -ENOTSUP; + + cd_offset = eocd->cd_offset; + cd_size = eocd->cd_size; + if (!check_access(archive, cd_offset, cd_size)) + return -EINVAL; + + archive->cd_offset = cd_offset; + archive->cd_records = cd_records; + return 0; +} + +static int find_cd(struct zip_archive *archive) +{ + int64_t limit, offset; + int rc = -EINVAL; + + if (archive->size <= sizeof(struct end_of_cd_record)) + return -EINVAL; + + /* Because the end of central directory ends with a variable length array of + * up to 0xFFFF bytes we can't know exactly where it starts and need to + * search for it at the end of the file, scanning the (limit, offset] range. + */ + offset = archive->size - sizeof(struct end_of_cd_record); + limit = (int64_t)offset - (1 << 16); + + for (; offset >= 0 && offset > limit && rc != 0; offset--) { + rc = try_parse_end_of_cd(archive, offset); + if (rc == -ENOTSUP) + break; + } + return rc; +} + +struct zip_archive *zip_archive_open(const char *path) +{ + struct zip_archive *archive; + int err, fd; + off_t size; + void *data; + + fd = open(path, O_RDONLY | O_CLOEXEC); + if (fd < 0) + return ERR_PTR(-errno); + + size = lseek(fd, 0, SEEK_END); + if (size == (off_t)-1 || size > UINT32_MAX) { + close(fd); + return ERR_PTR(-EINVAL); + } + + data = mmap(NULL, size, PROT_READ, MAP_PRIVATE, fd, 0); + err = -errno; + close(fd); + + if (data == MAP_FAILED) + return ERR_PTR(err); + + archive = malloc(sizeof(*archive)); + if (!archive) { + munmap(data, size); + return ERR_PTR(-ENOMEM); + }; + + archive->data = data; + archive->size = size; + + err = find_cd(archive); + if (err) { + munmap(data, size); + free(archive); + return ERR_PTR(err); + } + + return archive; +} + +void zip_archive_close(struct zip_archive *archive) +{ + munmap(archive->data, archive->size); + free(archive); +} + +static struct local_file_header *local_file_header_at_offset(struct zip_archive *archive, + __u32 offset) +{ + struct local_file_header *lfh; + + lfh = check_access(archive, offset, sizeof(*lfh)); + if (!lfh || lfh->magic != LOCAL_FILE_HEADER_MAGIC) + return NULL; + + return lfh; +} + +static int get_entry_at_offset(struct zip_archive *archive, __u32 offset, struct zip_entry *out) +{ + struct local_file_header *lfh; + __u32 compressed_size; + const char *name; + void *data; + + lfh = local_file_header_at_offset(archive, offset); + if (!lfh) + return -EINVAL; + + offset += sizeof(*lfh); + if ((lfh->flags & FLAG_ENCRYPTED) || (lfh->flags & FLAG_HAS_DATA_DESCRIPTOR)) + return -EINVAL; + + name = check_access(archive, offset, lfh->file_name_length); + if (!name) + return -EINVAL; + + offset += lfh->file_name_length; + if (!check_access(archive, offset, lfh->extra_field_length)) + return -EINVAL; + + offset += lfh->extra_field_length; + compressed_size = lfh->compressed_size; + data = check_access(archive, offset, compressed_size); + if (!data) + return -EINVAL; + + out->compression = lfh->compression; + out->name_length = lfh->file_name_length; + out->name = name; + out->data = data; + out->data_length = compressed_size; + out->data_offset = offset; + + return 0; +} + +int zip_archive_find_entry(struct zip_archive *archive, const char *file_name, + struct zip_entry *out) +{ + size_t file_name_length = strlen(file_name); + __u32 i, offset = archive->cd_offset; + + for (i = 0; i < archive->cd_records; ++i) { + __u16 cdfh_name_length, cdfh_flags; + struct cd_file_header *cdfh; + const char *cdfh_name; + + cdfh = check_access(archive, offset, sizeof(*cdfh)); + if (!cdfh || cdfh->magic != CD_FILE_HEADER_MAGIC) + return -EINVAL; + + offset += sizeof(*cdfh); + cdfh_name_length = cdfh->file_name_length; + cdfh_name = check_access(archive, offset, cdfh_name_length); + if (!cdfh_name) + return -EINVAL; + + cdfh_flags = cdfh->flags; + if ((cdfh_flags & FLAG_ENCRYPTED) == 0 && + (cdfh_flags & FLAG_HAS_DATA_DESCRIPTOR) == 0 && + file_name_length == cdfh_name_length && + memcmp(file_name, archive->data + offset, file_name_length) == 0) { + return get_entry_at_offset(archive, cdfh->offset, out); + } + + offset += cdfh_name_length; + offset += cdfh->extra_field_length; + offset += cdfh->file_comment_length; + } + + return -ENOENT; +} diff --git a/src/zip.h b/src/zip.h new file mode 100644 index 0000000..1c1bb21 --- /dev/null +++ b/src/zip.h @@ -0,0 +1,47 @@ +/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ + +#ifndef __LIBBPF_ZIP_H +#define __LIBBPF_ZIP_H + +#include + +/* Represents an open zip archive. + * Only basic ZIP files are supported, in particular the following are not + * supported: + * - encryption + * - streaming + * - multi-part ZIP files + * - ZIP64 + */ +struct zip_archive; + +/* Carries information on name, compression method, and data corresponding to a + * file in a zip archive. + */ +struct zip_entry { + /* Compression method as defined in pkzip spec. 0 means data is uncompressed. */ + __u16 compression; + + /* Non-null terminated name of the file. */ + const char *name; + /* Length of the file name. */ + __u16 name_length; + + /* Pointer to the file data. */ + const void *data; + /* Length of the file data. */ + __u32 data_length; + /* Offset of the file data within the archive. */ + __u32 data_offset; +}; + +/* Open a zip archive. Returns NULL in case of an error. */ +struct zip_archive *zip_archive_open(const char *path); + +/* Close a zip archive and release resources. */ +void zip_archive_close(struct zip_archive *archive); + +/* Look up an entry corresponding to a file in given zip archive. */ +int zip_archive_find_entry(struct zip_archive *archive, const char *name, struct zip_entry *out); + +#endif