Commit ef63ddc (parent 82ea80e)
Showing 7 changed files with 276 additions and 100 deletions.
@@ -1,22 +1,28 @@
 from typing_extensions import Annotated
-import typing
 import pathlib
-import dataclasses
 import typer
 import shutil
-import json
 import rich
+from probe_py.manual.scp import scp_with_provenance
 import rich.console
 import rich.pretty
 from ..generated.parser import parse_probe_log, parse_probe_log_ctx
 from . import analysis
-from .workflows import MakefileGenerator
+from .workflows import MakefileGenerator, NextflowGenerator
 from .ssh_argparser import parse_ssh_args
 from . import file_closure
 from . import graph_utils
 import subprocess
 import os
 import tempfile
 import enum
+from .persistent_provenance_db import Process, ProcessInputs, ProcessThatWrites, get_engine
+from sqlalchemy.orm import Session
+from .analysis import ProcessNode, FileNode
+import shlex
+import datetime
+import random
+import socket
+

 console = rich.console.Console(stderr=True)
@@ -110,6 +116,68 @@ def dataflow_graph(
     dataflow_graph = analysis.provlog_to_dataflow_graph(prov_log)
     graph_utils.serialize_graph(dataflow_graph, output)

+
+def get_host_name() -> int:
+    # Despite the name, this returns a pseudo-random 32-bit host identifier,
+    # reseeded on every call from the current time XORed with the hostname's hash.
+    hostname = socket.gethostname()
+    rng = random.Random(int(datetime.datetime.now().timestamp()) ^ hash(hostname))
+    bits_per_hex_digit = 4
+    hex_digits = 8
+    random_number = rng.getrandbits(bits_per_hex_digit * hex_digits)
+    return random_number
+
+
+@export_app.command()
+def store_dataflow_graph(probe_log: Annotated[
+        pathlib.Path,
+        typer.Argument(help="output file written by `probe record -o $file`."),
+    ] = pathlib.Path("probe_log")) -> None:
+    prov_log = parse_probe_log(probe_log)
+    dataflow_graph = analysis.provlog_to_dataflow_graph(prov_log)
+    engine = get_engine()
+    with Session(engine) as session:
+        # One Process row per process node; parent_process_id is filled in below.
+        for node in dataflow_graph.nodes():
+            if isinstance(node, ProcessNode):
+                print(node)
+                new_process = Process(process_id=int(node.pid), parent_process_id=0, cmd=shlex.join(node.cmd), time=datetime.datetime.now())
+                session.add(new_process)
+
+        for (node1, node2) in dataflow_graph.edges():
+            if isinstance(node1, ProcessNode) and isinstance(node2, ProcessNode):
+                # Process-to-process edge: record the parent/child relationship.
+                parent_process_id = node1.pid
+                child_process = session.get(Process, node2.pid)
+                if child_process:
+                    child_process.parent_process_id = parent_process_id
+
+            elif isinstance(node1, ProcessNode) and isinstance(node2, FileNode):
+                # Process-to-file edge: the process wrote this inode.
+                inode_info = node2.inodeOnDevice
+                host = get_host_name()
+                stat_info = os.stat(node2.file)
+                mtime = int(stat_info.st_mtime * 1_000_000_000)
+                size = stat_info.st_size
+                new_output_inode = ProcessThatWrites(inode=inode_info.inode, process_id=node1.pid, device_major=inode_info.device_major, device_minor=inode_info.device_minor, host=host, path=node2.file, mtime=mtime, size=size)
+                session.add(new_output_inode)
+
+            elif isinstance(node1, FileNode) and isinstance(node2, ProcessNode):
+                # File-to-process edge: the process read this inode.
+                inode_info = node1.inodeOnDevice
+                host = get_host_name()
+                stat_info = os.stat(node1.file)
+                mtime = int(stat_info.st_mtime * 1_000_000_000)
+                size = stat_info.st_size
+                new_input_inode = ProcessInputs(inode=inode_info.inode, process_id=node2.pid, device_major=inode_info.device_major, device_minor=inode_info.device_minor, host=host, path=node1.file, mtime=mtime, size=size)
+                session.add(new_input_inode)
+
+        # Sanity check: exactly one process may end up without a parent.
+        root_process = None
+        for node in dataflow_graph.nodes():
+            if isinstance(node, ProcessNode):
+                pid = node.pid
+                process_record = session.get(Process, pid)
+                if process_record and process_record.parent_process_id == 0:
+                    if root_process is not None:
+                        print(f"Error: multiple root processes - {pid} and {root_process}")
+                        session.rollback()
+                        return
+                    else:
+                        root_process = pid
+
+        session.commit()
+

 @export_app.command()
 def debug_text(
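A note on the additions above: because get_host_name() seeds its RNG with the current timestamp, it returns a different identifier on every call, so a single export can record the same host under many IDs. Below is a minimal sketch of a deterministic alternative; stable_host_id is a hypothetical name, not part of this commit.

import hashlib
import socket

def stable_host_id() -> int:
    # Hash only the hostname, so repeated calls on the same machine
    # always return the same 32-bit identifier.
    digest = hashlib.sha256(socket.gethostname().encode()).digest()
    return int.from_bytes(digest[:4], "big")

Since the command is registered on export_app, Typer should expose it as a subcommand along the lines of `probe export store-dataflow-graph probe_log` (the exact prefix depends on how export_app is mounted, which this diff does not show).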
@@ -227,8 +295,8 @@ def ssh(
     """

     flags, destination, remote_host = parse_ssh_args(ssh_args)

     ssh_cmd = ["ssh"] + flags

     libprobe = pathlib.Path(os.environ["__PROBE_LIB"]) / ("libprobe-dbg.so" if debug else "libprobe.so")
     if not libprobe.exists():
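The ssh hunk builds the command as an argv list rather than a single shell string, which sidesteps local quoting issues. A hedged illustration of the same pattern with placeholder values (the flags and destination here are assumptions, not values from the diff):

import subprocess

flags = ["-p", "22"]               # placeholder ssh flags
destination = "user@remote_host"   # placeholder destination
ssh_cmd = ["ssh"] + flags

# Each argv element is passed to ssh as-is; no shell interpolation happens locally.
subprocess.run(ssh_cmd + [destination, "true"], check=False)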
@@ -324,45 +392,35 @@ def makefile(
     script = g.generate_makefile(dataflow_graph)
     output.write_text(script)


 @export_app.command()
-def ops_jsonl(
+def nextflow(
+    output: Annotated[
+        pathlib.Path,
+        typer.Argument(),
+    ] = pathlib.Path("nextflow.nf"),
     probe_log: Annotated[
         pathlib.Path,
         typer.Argument(help="output file written by `probe record -o $file`."),
     ] = pathlib.Path("probe_log"),
 ) -> None:
     """
-    Export each op to a JSON line
+    Export the probe_log to a Nextflow workflow
     """
-    def filter_nested_dict(
-        dct: typing.Mapping[typing.Any, typing.Any],
-    ) -> typing.Mapping[typing.Any, typing.Any]:
-        return {
-            key: (
-                filter_nested_dict(val) if isinstance(val, dict) else
-                val.decode(errors="surrogateescape") if isinstance(val, bytes) else
-                val
-            )
-            for key, val in dct.items()
-        }
-    stdout_console = rich.console.Console()
     prov_log = parse_probe_log(probe_log)
-    for pid, process in prov_log.processes.items():
-        for exec_epoch_no, exec_epoch in process.exec_epochs.items():
-            for tid, thread in exec_epoch.threads.items():
-                for i, op in enumerate(thread.ops):
-                    stdout_console.print_json(json.dumps({
-                        "pid": pid,
-                        "tid": tid,
-                        "exec_epoch_no": exec_epoch_no,
-                        "i": i,
-                        "op": filter_nested_dict(
-                            dataclasses.asdict(op),
-                        ),
-                        "op_data_type": type(op.data).__name__,
-                    }))
+    dataflow_graph = analysis.provlog_to_dataflow_graph(prov_log)
+    g = NextflowGenerator()
+    # Write the generated workflow to the user-chosen output path.
+    script = g.generate_workflow(dataflow_graph)
+    output.write_text(script)


+# Example: scp Desktop/sample_example.txt user@remote_host:/home/remote_dir
+@app.command(
+    context_settings=dict(
+        ignore_unknown_options=True,
+    ),
+)
+def scp(cmd: list[str]) -> None:
+    scp_with_provenance(cmd)


 if __name__ == "__main__":
     app()
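For the two new commands, a hedged smoke test using Typer's test runner; it assumes this module is probe_py.manual.cli and that export_app is mounted on app under the name "export" (neither is shown in this diff):

from typer.testing import CliRunner
from probe_py.manual.cli import app  # assumed module path

runner = CliRunner()

# Generate a Nextflow workflow from an existing probe_log.
result = runner.invoke(app, ["export", "nextflow", "out.nf", "probe_log"])
print(result.exit_code)

# Copy a file with provenance tracking via the new scp wrapper.
result = runner.invoke(app, ["scp", "example.txt", "user@remote_host:/tmp/"])
print(result.exit_code)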