Merge pull request #2155 from r-sm2024/vmray_extractor

Add VMRayAnalysis model and call parser
mandiant · Jun 19, 2024 · 8757dad · 8757dad
2 parents 21887d1 + 0c9d3d0
commit 8757dad
Show file tree

Hide file tree

Showing 6 changed files with 140 additions and 16 deletions.
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
@@ -2,9 +2,9 @@ name: CI
 
 on:
   push:
-    branches: [ master ]
+    branches: [ master, vmray-extractor ]
   pull_request:
-    branches: [ master ]
+    branches: [ master, vmray-extractor ]
 
 permissions: read-all
 

diff --git a/capa/features/extractors/vmray/call.py b/capa/features/extractors/vmray/call.py
@@ -0,0 +1,58 @@
+import logging
+from typing import Tuple, Iterator
+
+from capa.helpers import assert_never
+from capa.features.insn import API, Number
+from capa.features.common import String, Feature
+from capa.features.address import Address
+from capa.features.extractors.vmray.models import Analysis
+from capa.features.extractors.base_extractor import CallHandle, ThreadHandle, ProcessHandle
+
+logger = logging.getLogger(__name__)
+
+
+def extract_function_calls(ph: ProcessHandle, th: ThreadHandle, ch: CallHandle) -> Iterator[Tuple[Feature, Address]]:
+    """
+    this method extracts the given call's features (such as API name and arguments),
+    and returns them as API, Number, and String features.
+
+    args:
+      call: FunctionCall object representing the XML fncall element
+
+      yields: Feature, address; where Feature is either: API, Number, or String.
+    """
+
+    # Extract API name
+    yield API(ch.inner.name), ch.inner.address
+
+    # Extract arguments from <in>
+    for param in ch.inner.in_:
+        value = param.value
+        if isinstance(value, str):
+            yield String(value), ch.inner.address
+
+        elif isinstance(value, int):
+            yield Number(value), ch.inner.address
+
+        else:
+            assert_never(value)
+
+    # Extract return value from <out>
+    if ch.inner.out is not None:
+        value = ch.inner.out.value
+        if isinstance(value, str):
+            yield String(value), ch.inner.address
+
+        elif isinstance(value, int):
+            yield Number(value), ch.inner.address
+
+        else:
+            assert_never(value)
+
+
+def extract_features(analysis: Analysis) -> Iterator[Tuple[Feature, Address]]:
+    """
+    Extract features from the Analysis object in models.py
+    """
+    for fncall in analysis.fncalls:
+        yield from extract_function_calls(fncall)
diff --git a/capa/features/extractors/vmray/extractor.py b/capa/features/extractors/vmray/extractor.py
@@ -11,6 +11,8 @@
 from pathlib import Path
 from zipfile import ZipFile
 
+from devtools import debug, pprint
+
 import capa.helpers
 import capa.features.extractors.vmray.file
 import capa.features.extractors.vmray.global_
@@ -97,4 +99,15 @@ def from_zipfile(cls, zipfile_path: Path):
             flog_xml = zipfile.read("logs/flog.xml", pwd=b"infected")
             flog = Analysis.from_xml(flog_xml)
 
+            debug(flog.processes[1])
+            pprint(flog.processes[0])
+
         return cls(VMRayAnalysis(sv2, flog))
+
+
+if __name__ == "__main__":
+    # TODO(mr): for testing, removeme
+    import sys
+
+    input_path = Path(sys.argv[1])
+    VMRayExtractor.from_zipfile(input_path)
diff --git a/capa/features/extractors/vmray/models.py b/capa/features/extractors/vmray/models.py
@@ -5,51 +5,102 @@
 # Unless required by applicable law or agreed to in writing, software distributed under the License
 #  is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and limitations under the License.
+
 from typing import Dict, List, Optional
 
 from pydantic import BaseModel
-
-# TODO install/force lxml?
 from pydantic_xml import BaseXmlModel, attr, element
 
+
 ### models for flog.xml
+class Param(BaseXmlModel, tag="param"):  # one <param> XML element, described entirely by its attributes
+    name: str = attr()  # required `name` attribute
+    type: str = attr()  # required `type` attribute, kept as a string
+    value: Optional[str] = attr(default=None)  # `value` attribute; None when the attribute is absent
+
+
+# or see https://pydantic-xml.readthedocs.io/en/latest/pages/quickstart.html#wrapper
+class In(BaseXmlModel, tag="in"):
+    params: List[Param] = element(name="in")
+
+
+class Out(BaseXmlModel, tag="out"):
+    params: List[Param] = element(name="out")
 
 
 class FunctionCall(BaseXmlModel, tag="fncall"):  # one <fncall> XML element
-    # ts: str = attr()
-    # fncall_id: int = attr()
-    # process_id: int = attr()
-    name: str = attr()
-    # in_: element(name="in")
-    # out: element()
+    ts: int = attr()  # `ts` attribute parsed as an integer; presumably a timestamp - TODO confirm units
+    fncall_id: int = attr()  # identifier of this call; FunctionReturn carries a `fncall_id` attribute too
+    process_id: int = attr()
+    thread_id: int = attr()
+    name: str = attr()  # API call name?
+    address: str = attr(name="addr")  # read from the `addr` XML attribute, kept as a string
+    from_: str = attr(name="from")  # read from the `from` XML attribute; `from` is a Python keyword, hence the underscore
+    in_: Optional[In] = element(tag="in", default=None)  # child <in> element; None when absent
+    out_: Optional[Out] = element(tag="out", default=None)  # child <out> element; None when absent
+
 
+# note that not all fncalls always have an associated fnret, e.g. exit or WaitForSingleObject
+class FunctionReturn(BaseXmlModel, tag="fnret"):  # one <fnret> XML element
+    ts: int = attr()  # `ts` attribute parsed as an integer
+    fncall_id: int = attr()  # presumably matches FunctionCall.fncall_id - TODO confirm
+    address: str = attr(name="addr")  # string that contains a hex value
+    from_: str = attr(name="from")  # string that contains a hex value
 
+
+# TODO check whether multiple of these are present
 class MonitorProcess(BaseXmlModel, tag="monitor_process"):  # one <monitor_process> XML element
-    ts: str = attr()
+    ts: int = attr()  # `ts` attribute parsed as an integer
     process_id: int = attr()
     image_name: str = attr()
 
 
+# TODO check whether multiple of these are present
 class MonitorThread(BaseXmlModel, tag="monitor_thread"):  # one <monitor_thread> XML element
-    ts: str = attr()
+    ts: int = attr()  # `ts` attribute parsed as an integer
     thread_id: int = attr()
     process_id: int = attr()
     os_tid: str = attr()  # TODO hex
 
 
-class Analysis(BaseXmlModel, tag="analysis"):
+class NewRegion(BaseXmlModel, tag="new_region"):  # one <new_region> XML element
+    ts: int = attr()  # `ts` attribute parsed as an integer
+    region_id: int = attr()  # RemoveRegion carries a `region_id` attribute as well
+    process_id: int = attr()
+    start_va: str = attr()  # kept as a string; presumably a hex virtual address - TODO confirm
+    end_va: str = attr()  # kept as a string; presumably a hex virtual address - TODO confirm
+    entry_point: str = attr()  # kept as a string
+
+
+class RemoveRegion(BaseXmlModel, tag="remove_region"):  # one <remove_region> XML element
+    ts: int = attr()  # `ts` attribute parsed as an integer
+    region_id: int = attr()  # presumably the `region_id` of an earlier NewRegion - TODO confirm
+
+
+# unordered is very slow, but elements may occur in any order
+class Analysis(BaseXmlModel, tag="analysis", search_mode="unordered"):  # root <analysis> element parsed from flog.xml
     log_version: str = attr()
     analyzer_version: str = attr()
     analysis_date: str = attr()
+
+    # a single mixed list of all child kinds parses very slowly:
+    # data: List[Union[MonitorProcess, MonitorThread, NewRegion, RemoveRegion, FunctionCall, FunctionReturn]]
+
+    # may want to preprocess the file and remove/reorder entries for more efficient parsing
+
     processes: List[MonitorProcess] = element(tag="monitor_process")
     threads: List[MonitorThread] = element(tag="monitor_thread")
-    # failing so far...
-    # fncall: List[FunctionCall] = element(tag="fncall")
 
+    # not important, and they slow down parsing
+    # new_regions: List[NewRegion] = element(tag="new_region")
+    # remove_regions: List[RemoveRegion] = element(tag="remove_region")
 
-### models for summary_v2.json files
+    # very slow alternative; calls: List[Union[FunctionCall, FunctionReturn]]
+    fncalls: List[FunctionCall] = element(tag="fncall")  # every <fncall> child
+    fnrets: List[FunctionReturn] = element(tag="fnret")  # every <fnret> child
 
 
+### models for summary_v2.json files
 class GenericReference(BaseModel):
     path: List[str]
     source: str

diff --git a/pyproject.toml b/pyproject.toml
@@ -79,6 +79,7 @@ dependencies = [
     "rich>=13",
     "humanize>=4",
     "protobuf>=5",
+    "pydantic_xml[lxml]>=2.11",  # TODO benchmark lxml vs. elementtree - first impression eltree faster
 
     # ---------------------------------------
     # Dependencies that we develop

diff --git a/requirements.txt b/requirements.txt
@@ -28,6 +28,7 @@ pyasn1-modules==0.2.8
 pycparser==2.22
 pydantic==2.7.3
 pydantic-core==2.18.4
+pydantic-xml==2.11.0
 pyelftools==0.31
 pygments==2.18.0
 python-flirt==0.8.10