Skip to content

Commit

Permalink
Merge pull request #2155 from r-sm2024/vmray_extractor
Browse files Browse the repository at this point in the history
Add VMRayAnalysis model and call parser
  • Loading branch information
mr-tz authored Jun 19, 2024
2 parents 21887d1 + 0c9d3d0 commit 8757dad
Show file tree
Hide file tree
Showing 6 changed files with 140 additions and 16 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,9 @@ name: CI

on:
push:
branches: [ master ]
branches: [ master, vmray-extractor ]
pull_request:
branches: [ master ]
branches: [ master, vmray-extractor ]

permissions: read-all

Expand Down
58 changes: 58 additions & 0 deletions capa/features/extractors/vmray/call.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
import logging
from typing import Tuple, Iterator

from capa.helpers import assert_never
from capa.features.insn import API, Number
from capa.features.common import String, Feature
from capa.features.address import Address
from capa.features.extractors.vmray.models import Analysis
from capa.features.extractors.base_extractor import CallHandle, ThreadHandle, ProcessHandle

logger = logging.getLogger(__name__)


def _extract_param_features(param, address) -> Iterator[Tuple[Feature, Address]]:
    """yield the String or Number feature for a single call parameter, if it carries a value."""
    value = param.value
    if value is None:
        # `value` is optional on the flog Param model (it defaults to None);
        # a parameter without a value has nothing to report, so skip it rather than assert.
        return

    if isinstance(value, str):
        yield String(value), address

    elif isinstance(value, int):
        yield Number(value), address

    else:
        assert_never(value)


def extract_function_calls(ph: ProcessHandle, th: ThreadHandle, ch: CallHandle) -> Iterator[Tuple[Feature, Address]]:
    """
    this method extracts the given call's features (such as API name and arguments),
    and returns them as API, Number, and String features.

    args:
      ph: handle of the process that made the call (not used here).
      th: handle of the thread that made the call (not used here).
      ch: handle of the call; `ch.inner` is the FunctionCall object representing the XML fncall element.
    yields: Feature, address; where Feature is either: API, Number, or String.
    """
    call = ch.inner

    # NOTE(review): the FunctionCall model declares `address` as a string (the "addr" attribute),
    # not as an Address instance - confirm whether the call handle's own address should be yielded instead.
    address = call.address

    # Extract API name
    yield API(call.name), address

    # Extract arguments from <in>, then from <out>.
    # both are optional on the model, and each wraps its parameters in a `params` list.
    for group in (call.in_, call.out_):
        if group is None:
            continue

        for param in group.params:
            yield from _extract_param_features(param, address)


def extract_features(analysis: Analysis) -> Iterator[Tuple[Feature, Address]]:
    """
    extract the features of every function call recorded in the given Analysis object (see models.py).

    args:
      analysis: parsed flog.xml model; its `fncalls` list is walked in order.
    yields: Feature, address; as produced by extract_function_calls for each fncall.
    """
    # imported locally because this module only imports `Address` at file level.
    from capa.features.address import ThreadAddress, ProcessAddress, DynamicCallAddress

    for fncall in analysis.fncalls:
        # extract_function_calls is declared as (ph, th, ch); calling it with the bare fncall
        # raises TypeError, so wrap the fncall in the handles that the signature asks for.
        # NOTE(review): assumes the address constructors take (pid), (process, tid) and (thread, id),
        # and that the handles are built from (address, inner) - confirm against address.py and base_extractor.py.
        process_address = ProcessAddress(pid=fncall.process_id)
        thread_address = ThreadAddress(process=process_address, tid=fncall.thread_id)
        call_address = DynamicCallAddress(thread=thread_address, id=fncall.fncall_id)

        # the process and thread handles carry no model object here; only the call handle's `inner` is read.
        ph = ProcessHandle(address=process_address, inner=None)
        th = ThreadHandle(address=thread_address, inner=None)
        ch = CallHandle(address=call_address, inner=fncall)

        yield from extract_function_calls(ph, th, ch)
13 changes: 13 additions & 0 deletions capa/features/extractors/vmray/extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@
from pathlib import Path
from zipfile import ZipFile

from devtools import debug, pprint

import capa.helpers
import capa.features.extractors.vmray.file
import capa.features.extractors.vmray.global_
Expand Down Expand Up @@ -97,4 +99,15 @@ def from_zipfile(cls, zipfile_path: Path):
flog_xml = zipfile.read("logs/flog.xml", pwd=b"infected")
flog = Analysis.from_xml(flog_xml)

debug(flog.processes[1])
pprint(flog.processes[0])

return cls(VMRayAnalysis(sv2, flog))


if __name__ == "__main__":
    # TODO(mr): for testing, removeme
    import sys

    # fail with a usage message instead of a bare IndexError when the path is missing.
    if len(sys.argv) != 2:
        sys.exit(f"usage: {sys.argv[0]} <path to VMRay analysis zip archive>")

    input_path = Path(sys.argv[1])
    VMRayExtractor.from_zipfile(input_path)
79 changes: 65 additions & 14 deletions capa/features/extractors/vmray/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,51 +5,102 @@
# Unless required by applicable law or agreed to in writing, software distributed under the License
# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and limitations under the License.

from typing import Dict, List, Optional

from pydantic import BaseModel

# TODO install/force lxml?
from pydantic_xml import BaseXmlModel, attr, element


### models for flog.xml
class Param(BaseXmlModel, tag="param"):
    """a `<param>` element; all three fields are read from XML attributes."""

    name: str = attr()
    type: str = attr()
    # optional: a param without a `value` attribute parses with value=None.
    value: Optional[str] = attr(default=None)


# or see https://pydantic-xml.readthedocs.io/en/latest/pages/quickstart.html#wrapper
class In(BaseXmlModel, tag="in"):
    """an `<in>` element: the list of `<param>` children that it wraps."""

    # select the children explicitly by tag, like the other `element(tag=...)` fields in this file;
    # `element()` has no `name` keyword, so the previous `name="in"` did not select anything.
    params: List[Param] = element(tag="param")


class Out(BaseXmlModel, tag="out"):
    """an `<out>` element: the list of `<param>` children that it wraps."""

    # select the children explicitly by tag, like the other `element(tag=...)` fields in this file;
    # `element()` has no `name` keyword, so the previous `name="out"` did not select anything.
    params: List[Param] = element(tag="param")


class FunctionCall(BaseXmlModel, tag="fncall"):
    """a `<fncall>` element: identifying attributes plus the optional `<in>` and `<out>` children."""

    ts: int = attr()
    fncall_id: int = attr()
    process_id: int = attr()
    thread_id: int = attr()
    name: str = attr()  # API call name?
    # the XML attributes are called "addr" and "from"; `from` is a Python keyword, hence the trailing underscore.
    address: str = attr(name="addr")
    from_: str = attr(name="from")
    # either child may be absent, in which case the field is None.
    in_: Optional[In] = element(tag="in", default=None)
    out_: Optional[Out] = element(tag="out", default=None)


# note that not every fncall has an associated fnret, e.g. exit or WaitForSingleObject
class FunctionReturn(BaseXmlModel, tag="fnret"):
    """a `<fnret>` element; `fncall_id` carries the id of a function call."""

    ts: int = attr()
    fncall_id: int = attr()
    address: str = attr(name="addr")  # string that contains a hex value
    from_: str = attr(name="from")  # string that contains a hex value


# TODO check multiple are there
class MonitorProcess(BaseXmlModel, tag="monitor_process"):
    """a `<monitor_process>` element, read from its XML attributes."""

    # `ts` was declared twice (str, then int); only the later int declaration took effect.
    ts: int = attr()
    process_id: int = attr()
    image_name: str = attr()


# TODO check multiple are there
class MonitorThread(BaseXmlModel, tag="monitor_thread"):
    """a `<monitor_thread>` element, read from its XML attributes."""

    # `ts` was declared twice (str, then int); only the later int declaration took effect.
    ts: int = attr()
    thread_id: int = attr()
    process_id: int = attr()
    os_tid: str = attr()  # TODO hex


class NewRegion(BaseXmlModel, tag="new_region"):
    """a `<new_region>` element, read from its XML attributes."""

    ts: int = attr()
    region_id: int = attr()
    process_id: int = attr()
    # NOTE(review): kept as strings; presumably hex values like the other address fields - confirm.
    start_va: str = attr()
    end_va: str = attr()
    entry_point: str = attr()


class RemoveRegion(BaseXmlModel, tag="remove_region"):
    """a `<remove_region>` element; `region_id` carries the id of a region."""

    ts: int = attr()
    region_id: int = attr()

# unordered is very slow, but elements may occur in any order
class Analysis(BaseXmlModel, tag="analysis", search_mode="unordered"):
    """root `<analysis>` element of flog.xml: version attributes plus the child elements collected by tag."""

    log_version: str = attr()
    analyzer_version: str = attr()
    analysis_date: str = attr()

    # super slow
    # data: List[Union[MonitorProcess, MonitorThread, NewRegion, RemoveRegion, FunctionCall, FunctionReturn]]

    # may want to preprocess file and remove/reorder entries for more efficient parsing

    processes: List[MonitorProcess] = element(tag="monitor_process")
    threads: List[MonitorThread] = element(tag="monitor_thread")

    # not important and slow down parsing
    # new_regions: List[NewRegion] = element(tag="new_region")
    # remove_regions: List[RemoveRegion] = element(tag="remove_region")

    # very slow alternative; calls: List[Union[FunctionCall, FunctionReturn]]
    fncalls: List[FunctionCall] = element(tag="fncall")
    fnrets: List[FunctionReturn] = element(tag="fnret")


### models for summary_v2.json files
class GenericReference(BaseModel):
path: List[str]
source: str
Expand Down
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,7 @@ dependencies = [
"rich>=13",
"humanize>=4",
"protobuf>=5",
"pydantic_xml[lxml]>=2.11", # TODO benchmark lxml vs. elementtree - first impression eltree faster

# ---------------------------------------
# Dependencies that we develop
Expand Down
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ pyasn1-modules==0.2.8
pycparser==2.22
pydantic==2.7.3
pydantic-core==2.18.4
pydantic-xml==2.11.0
pyelftools==0.31
pygments==2.18.0
python-flirt==0.8.10
Expand Down

0 comments on commit 8757dad

Please sign in to comment.