Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Generate OCI and Docker images from PROBE record #64

Closed
wants to merge 5 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 2 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -84,12 +84,11 @@ $ probe dump

That's a huge [work in progress](https://github.com/charmoniumQ/PROBE/pulls).

We're starting out with just "analysis" of the provenance. Does this input file influence that output file in the PROBEd process? Run
Try exporting to different formats.


``` bash
nix shell nixpkgs#graphviz github:charmoniumQ/PROBE#probe-py-manual \
--command sh -c 'python -m probe_py.manual.cli process-graph | tee /dev/stderr | dot -Tpng -ooutput.png /dev/stdin'
probe export --help
```

## Developing PROBE
Expand Down
13 changes: 10 additions & 3 deletions flake.nix
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,8 @@
${frontend.packages.probe-cli}/bin/probe \
$out/bin/probe \
--set __PROBE_LIB ${libprobe}/lib \
--prefix PATH : ${probe-py}/bin
--prefix PATH : ${probe-py}/bin \
--prefix PATH : ${pkgs.buildah}/bin
'';
};
probe-py-generated = frontend.packages.probe-py-generated;
Expand Down Expand Up @@ -157,7 +158,12 @@
probe-integration-tests = pkgs.stdenv.mkDerivation {
name = "probe-integration-tests";
src = ./probe_src/tests;
nativeBuildInputs = [packages.probe-bundled packages.probe-py];
nativeBuildInputs = [
packages.probe-bundled
packages.probe-py
pkgs.podman
pkgs.docker
];
buildPhase = "touch $out";
checkPhase = ''
pytest .
Expand All @@ -182,7 +188,6 @@
pkgs.cargo-expand
pkgs.cargo-flamegraph
pkgs.cargo-watch
pkgs.gdb
pkgs.rust-analyzer

(python.withPackages (pypkgs: [
Expand All @@ -205,6 +210,7 @@

# (export-and-rename python312-debug [["bin/python" "bin/python-dbg"]])

pkgs.buildah
pkgs.which
pkgs.gnumake
pkgs.gcc
Expand All @@ -216,6 +222,7 @@
pkgs.ruff
pkgs.cachix
pkgs.jq # to make cachix work
pkgs.podman
]
# gdb broken on i686
++ pkgs.lib.lists.optional (system != "i686-linux") pkgs.nextflow
Expand Down
7 changes: 7 additions & 0 deletions lightweight_env.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
#!/usr/bin/env bash

# `nix develop` brings a ton of extra variables into the environment,
# which complicates testing probe.
# To simplify, use this script: it runs the given command in an otherwise
# empty environment that keeps only __PROBE_LIB, PATH, and PYTHONPATH.
#
# Usage: ./lightweight_env.sh COMMAND [ARGS...]

# Every expansion is quoted so that values and arguments containing
# whitespace or glob characters reach the command unchanged.
env - __PROBE_LIB="$__PROBE_LIB" PATH="$PATH" PYTHONPATH="$PYTHONPATH" "$@"
2 changes: 2 additions & 0 deletions probe_src/frontend/cli/src/dump.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ use serde::{Deserialize, Serialize};
///
/// This hides some of the data and so is not suitable for machine consumption; use
/// [`to_stdout_json()`] instead.
#[allow(dead_code)]
pub fn to_stdout<P: AsRef<Path>>(tar_path: P) -> Result<()> {
dump_internal(tar_path, |(pid, epoch, tid), ops| {
let mut stdout = std::io::stdout().lock();
Expand All @@ -33,6 +34,7 @@ pub fn to_stdout<P: AsRef<Path>>(tar_path: P) -> Result<()> {
/// ```
///
/// (without whitespace)
#[allow(dead_code)]
pub fn to_stdout_json<P: AsRef<Path>>(tar_path: P) -> Result<()> {
dump_internal(tar_path, |(pid, epoch, tid), ops| {
let mut stdout = std::io::stdout().lock();
Expand Down
27 changes: 3 additions & 24 deletions probe_src/frontend/cli/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -67,19 +67,9 @@ fn main() -> Result<()> {
.value_parser(value_parser!(OsString)),
])
.about("Convert PROBE records to PROBE logs."),
// Command::new("dump")
// .args([
// arg!(--json "Output JSON.")
// .required(false)
// .value_parser(value_parser!(bool)),
// arg!(-i --input <PATH> "Path to load PROBE log from.")
// .required(false)
// .default_value("probe_log")
// .value_parser(value_parser!(OsString)),
// ])
// .about("Write the data from probe log data in a human-readable manner"),
// TODO: Dump is temporarily broken by https://github.com/charmoniumQ/PROBE/pull/60.
// For now, we can just use tar xvf or analysis.generated.parse_prov_log(...) instead
/* No more probe dump in Rust.
* See `probe export debug-text` in Python.
* */
Command::new("__gdb-exec-shim").hide(true).arg(
arg!(<CMD> ... "Command to run")
.required(true)
Expand Down Expand Up @@ -127,17 +117,6 @@ fn main() -> Result<()> {
.and_then(|mut tar| transcribe::transcribe(input, &mut tar))
.wrap_err("Transcribe command failed")
}
Some(("dump", sub)) => {
let json = sub.get_flag("json");
let input = sub.get_one::<OsString>("input").unwrap().clone();

if json {
dump::to_stdout_json(input)
} else {
dump::to_stdout(input)
}
.wrap_err("Dump command failed")
}
Some(("__gdb-exec-shim", sub)) => {
let cmd = sub
.get_many::<OsString>("CMD")
Expand Down
113 changes: 57 additions & 56 deletions probe_src/frontend/python/probe_py/generated/parser.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,12 @@
from __future__ import annotations
import os
import contextlib
import tempfile
import pathlib
import typing
import json
import tarfile
from dataclasses import dataclass
from dataclasses import dataclass, replace
from . import ops

@dataclass(frozen=True)
Expand Down Expand Up @@ -31,68 +35,65 @@ class InodeVersionLog:
tv_nsec: int
size: int

@staticmethod
def from_path(path: pathlib.Path) -> InodeVersionLog:
    """Build an InodeVersionLog from the current stat() result of `path`.

    The positional fields are, in order: device major number, device minor
    number, inode number, whole seconds of the modification time, the
    remaining nanoseconds of the modification time, and the file size.
    """
    stat_result = path.stat()
    # Split the nanosecond-resolution mtime into (seconds, nanoseconds).
    mtime_sec, mtime_nsec = divmod(stat_result.st_mtime_ns, 1_000_000_000)
    device = stat_result.st_dev
    return InodeVersionLog(
        os.major(device),
        os.minor(device),
        stat_result.st_ino,
        mtime_sec,
        mtime_nsec,
        stat_result.st_size,
    )


@dataclass(frozen=True)
class ProvLog:
processes: typing.Mapping[int, ProcessProvLog]
inodes: typing.Mapping[InodeVersionLog, str]
inodes: typing.Mapping[InodeVersionLog, pathlib.Path]
has_inodes: bool

def parse_probe_log(probe_log: pathlib.Path) -> ProvLog:
op_map = dict[int, dict[int, dict[int, ThreadProvLog]]]()
inodes = dict[InodeVersionLog, str]()
has_inodes = False

tar = tarfile.open(probe_log, mode='r')

for item in tar:
# items with size zero are directories in the tarball
if item.size == 0:
continue

# extract and name the hierarchy components
parts = item.name.split("/")
if parts[0] == "info":
if parts[1] == "copy_files":
has_inodes = True
elif parts[0] == "inodes":
if len(parts) != 2:
raise RuntimeError("Invalid probe_log")
inodes[InodeVersionLog(*[
@contextlib.contextmanager
def parse_probe_log_ctx(
probe_log: pathlib.Path,
) -> typing.Iterator[ProvLog]:
"""Parse probe log; return provenance data and inode contents"""
with tempfile.TemporaryDirectory() as _tmpdir:
tmpdir = pathlib.Path(_tmpdir)
with tarfile.open(probe_log, mode="r") as tar:
tar.extractall(tmpdir, filter="data")
has_inodes = (tmpdir / "info" / "copy_files").exists()
inodes = {
InodeVersionLog(*[
int(segment, 16)
for segment in parts[1].split("-")
])] = item.name
elif parts[0] == "pids":
if len(parts) != 4:
raise RuntimeError("Invalid probe_log")
pid: int = int(parts[1])
epoch: int = int(parts[2])
tid: int = int(parts[3])

# extract file contents as byte buffer
file = tar.extractfile(item)
if file is None:
raise IOError("Unable to read jsonlines from probe log")

# read, split, comprehend, deserialize, extend
jsonlines = file.read().strip().split(b"\n")
ops = ThreadProvLog(tid, [json.loads(x, object_hook=op_hook) for x in jsonlines])
op_map.setdefault(pid, {}).setdefault(epoch, {})[tid] = ops

return ProvLog(
processes={
pid: ProcessProvLog(
pid,
{
epoch: ExecEpochProvLog(epoch, threads)
for epoch, threads in epochs.items()
},
)
for pid, epochs in op_map.items()
},
inodes=inodes,
has_inodes=has_inodes,
)
for segment in file.name.split("-")
]): file
for file in (tmpdir / "inodes").iterdir()
} if (tmpdir / "inodes").exists() else {}

processes = {}
for pid_dir in (tmpdir / "pids").iterdir():
pid = int(pid_dir.name)
epochs = {}
for epoch_dir in pid_dir.iterdir():
epoch = int(epoch_dir.name)
tids = {}
for tid_file in epoch_dir.iterdir():
tid = int(tid_file.name)
# read, split, comprehend, deserialize, extend
jsonlines = tid_file.read_text().strip().split("\n")
tids[tid] = ThreadProvLog(tid, [json.loads(x, object_hook=op_hook) for x in jsonlines])
epochs[epoch] = ExecEpochProvLog(epoch, tids)
processes[pid] = ProcessProvLog(pid, epochs)
yield ProvLog(processes, inodes, has_inodes)

def parse_probe_log(
    probe_log: pathlib.Path,
) -> ProvLog:
    """Parse a probe log and return only its provenance data.

    This wraps parse_probe_log_ctx and discards the inode contents: the
    returned ProvLog has an empty `inodes` mapping and `has_inodes` set to
    False.
    """
    with parse_probe_log_ctx(probe_log) as full_log:
        # Copy the parsed log with the inode information blanked out.
        without_inodes = replace(full_log, inodes={}, has_inodes=False)
    return without_inodes

def op_hook(json_map: typing.Dict[str, typing.Any]) -> typing.Any:
ty: str = json_map["_type"]
Expand Down
5 changes: 3 additions & 2 deletions probe_src/libprobe/src/lookup_on_path.c
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ static bool lookup_on_path(BORROWED const char* bin_name, BORROWED char* bin_pat
/* TODO: Test case where PATH starts with : */
path_segment_start++;
}
bool has_elements = path_segment_start == NULL || path_segment_start[0] == '\0';
bool has_elements = path_segment_start != NULL && path_segment_start[0] != '\0';
/* TODO: Use default PATH when PATH is unset */
if (has_elements) {
while (true) {
Expand All @@ -29,10 +29,11 @@ static bool lookup_on_path(BORROWED const char* bin_name, BORROWED char* bin_pat
op.data.access.ferrno = errno;
prov_log_record(op);
}
while (path_segment_start[0] == ':') {
while (path_segment_start[0] != ':') {
/* TODO: Test case where PATH segment contains empty strings, /foo/bin:::/bar/bin */
path_segment_start++;
}
path_segment_start++;
if (path_segment_start[0] == '\0') {
break;
}
Expand Down
2 changes: 2 additions & 0 deletions probe_src/libprobe/src/prov_ops.c
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,8 @@ static struct Path create_path_lazy(int dirfd, BORROWED const char* path, int fl
ret.ctime = statx_buf.stx_ctime;
ret.size = statx_buf.stx_size;
ret.stat_valid = true;
} else {
DEBUG("Stat of %d,%s is not valid", dirfd, path);
}
return ret;
} else {
Expand Down
Loading
Loading