Skip to content

Commit

Permalink
Native library detection plugin (#267)
Browse files Browse the repository at this point in the history
Co-authored-by: Wangmo Tenzing <[email protected]>
  • Loading branch information
wangmot and wangmot authored Dec 17, 2024
1 parent d84d6a0 commit bff6a2f
Show file tree
Hide file tree
Showing 3 changed files with 198 additions and 0 deletions.
92 changes: 92 additions & 0 deletions scripts/native_libraries/get_emba_db.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
import json
import os
import re

import requests

from surfactant.configmanager import ConfigManager


def load_database(url, timeout=30):
    """Download the document at *url* and return its body as text.

    Args:
        url: Address of the resource to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response body decoded as text.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not respond within *timeout*.
    """
    # requests has no default timeout; without one an unresponsive server
    # would block this script indefinitely.
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    return response.text


def parse_cfg_file(content):
    """Parse semicolon-separated version-string rules into a pattern database.

    Each usable row has at least four ``;``-separated fields: field 0 is the
    library name, field 1 the matching mode and field 3 the quoted regular
    expression. Rows whose mode is neither empty nor ``"strict"`` are ignored.

    Args:
        content: Full text of the configuration file.

    Returns:
        A dict mapping library name to
        ``{"filename": [...], "filecontent": [...]}``. ``strict`` rows
        contribute the library name as a filename; rows with an empty mode
        contribute their regular expression as a file-content pattern.
    """
    database = {}

    for raw_line in content.splitlines():
        line = raw_line.strip()

        # Skip blank lines, comments and the column-header row.
        if not line or line.startswith(("#", "identifier")):
            continue

        # Split by semicolons
        fields = line.split(";")
        if len(fields) < 4:
            # Indexing fields[3] on a short row would raise IndexError.
            print(f"Skipping malformed line (expected at least 4 fields): {line}")
            continue

        lib_name, mode, raw_pattern = fields[0], fields[1], fields[3]

        if raw_pattern.startswith('"') and raw_pattern.endswith('""'):
            # Drop the opening quote and one of the two closing quotes; the
            # remaining quote stays in the pattern.
            filecontent = raw_pattern[1:-1]
        elif raw_pattern.endswith('""'):
            filecontent = raw_pattern[:-1]
        else:
            filecontent = raw_pattern.strip('"')

        if mode == "strict":
            # Strict mode is deprecated so those entries will be matched just
            # by filename. Record the filename even when the library already
            # has regex entries, so the result does not depend on row order.
            entry = database.setdefault(lib_name, {"filename": [], "filecontent": []})
            if lib_name not in entry["filename"]:
                entry["filename"].append(lib_name)
        elif mode == "":
            # Only keep patterns that compile as a bytes regular expression.
            try:
                re.compile(filecontent.encode("utf-8"))
            except re.error as e:
                print(f"Error parsing file content regexp {filecontent}: {e}")
                continue
            # EMBA rows carry no filename patterns, so "filename" starts empty.
            entry = database.setdefault(lib_name, {"filename": [], "filecontent": []})
            entry["filecontent"].append(filecontent)

    return database


def _strip_anchors(pattern):
    """Return *pattern* without a leading ``^`` or a trailing unescaped ``$``."""
    if pattern.startswith("^"):
        pattern = pattern[1:]
    # A trailing "\$" is an escaped literal dollar sign, not an anchor.
    if pattern.endswith("$") and not pattern.endswith("\\$"):
        pattern = pattern[:-1]
    return pattern


# Use database from this specific commit
emba_database_url = "https://raw.githubusercontent.com/e-m-b-a/emba/11d6c281189c3a14fc56f243859b0bccccce8b9a/config/bin_version_strings.cfg"
json_file_path = ConfigManager().get_data_dir_path() / "native_lib_patterns" / "emba.json"

file_content = load_database(emba_database_url)

parsed_data = parse_cfg_file(file_content)

# Remove the line anchors from every file-content pattern. Both anchors are
# stripped from the same string; slicing the untouched original for the "$"
# case would put a just-removed "^" back.
for value in parsed_data.values():
    value["filecontent"] = [_strip_anchors(pattern) for pattern in value["filecontent"]]

os.makedirs(os.path.dirname(json_file_path), exist_ok=True)
with open(json_file_path, "w") as json_file:
    json.dump(parsed_data, json_file, indent=4)
104 changes: 104 additions & 0 deletions surfactant/infoextractors/native_lib_file.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
import json
import os
import re
from typing import Any, Dict, List, Optional

from loguru import logger

import surfactant.plugin
from surfactant.configmanager import ConfigManager
from surfactant.sbomtypes import SBOM, Software


@surfactant.plugin.hookimpl
def short_name() -> Optional[str]:
    """Return the short name of this plugin, ``"native_lib_patterns"``."""
    return "native_lib_patterns"


def load_pattern_db():
    """Load the native library pattern database from ``native_lib_patterns``.

    Returns:
        The parsed JSON content of the file, or ``None`` when the file is
        missing or cannot be parsed. A warning is logged in both failure
        cases.
    """
    try:
        with open(native_lib_patterns, "r", encoding="utf-8") as regex:
            return json.load(regex)
    except FileNotFoundError:
        logger.warning(f"File not found for native library detection: {native_lib_patterns}")
    except (json.JSONDecodeError, UnicodeDecodeError) as err:
        # This function runs at module import; a truncated or corrupt file
        # should leave the database unset rather than raise during import.
        logger.warning(
            f"Invalid native library pattern database {native_lib_patterns}: {err}"
        )
    return None


# Load the pattern database once at module import.
# native_lib_patterns must be bound before load_pattern_db() is called,
# because that function reads the path from this module-level name.
native_lib_patterns = ConfigManager().get_data_dir_path() / "native_lib_patterns" / "emba.json"
# Parsed pattern database; None when the pattern file does not exist.
database = load_pattern_db()


def supports_file(filetype) -> bool:
    """Return True when *filetype* is one of the file types this module handles."""
    handled_types = (
        "PE",
        "ELF",
        "MACHOFAT",
        "MACHOFAT64",
        "MACHO32",
        "MACHO64",
    )
    return filetype in handled_types


@surfactant.plugin.hookimpl
def extract_file_info(sbom: SBOM, software: Software, filename: str, filetype: str) -> object:
    """Return native library details for *filename*, or None for other file types.

    Only *filename* and *filetype* are used; *sbom* and *software* are
    accepted but not read here.
    """
    if supports_file(filetype):
        return extract_native_lib_info(filename)
    return None


def extract_native_lib_info(filename):
    """Identify known native libraries from a file's name and contents.

    Returns None when no pattern database is loaded. Otherwise returns
    ``{"nativeLibraries": [...]}`` holding at most one ``{"isLibrary": [...]}``
    entry (filename matches) followed by at most one
    ``{"containsLibrary": [...]}`` entry (content matches). A library already
    matched by filename is not repeated under ``containsLibrary``.
    """
    if not database:
        return None

    seen = set()

    def _first_occurrences(matches, key):
        # Keep each library name once, in match order, across both passes.
        fresh = []
        for entry in matches:
            name = entry[key]
            if name in seen:
                continue
            seen.add(name)
            fresh.append(name)
        return fresh

    # Pass 1: compare the bare file name against known library file names.
    name_matches = match_by_attribute("filename", os.path.basename(filename), database)
    is_library = _first_occurrences(name_matches, "isLibrary")

    # Pass 2: search the raw bytes of the file for known content patterns.
    contains_library = []
    try:
        with open(filename, "rb") as handle:
            raw_bytes = handle.read()
            content_matches = match_by_attribute("filecontent", raw_bytes, database)
            contains_library = _first_occurrences(content_matches, "containsLibrary")
    except FileNotFoundError:
        logger.warning(f"File not found: {filename}")

    entries = []
    if is_library:
        entries.append({"isLibrary": is_library})
    if contains_library:
        entries.append({"containsLibrary": contains_library})

    native_lib_info: Dict[str, Any] = {"nativeLibraries": entries}
    return native_lib_info


def match_by_attribute(attribute: str, content: str, patterns_database: Dict) -> List[Dict]:
    """Return one result dict per pattern in the database that matches *content*.

    With ``attribute == "filename"`` each pattern is compared to *content*
    case-insensitively and hits are reported as ``{"isLibrary": name}``.
    With ``attribute == "filecontent"`` each pattern is encoded to UTF-8 and
    searched for as a regular expression in *content* (bytes in this mode,
    despite the annotation); hits are reported as ``{"containsLibrary": name}``.
    A library appears once per matching pattern, so duplicates are possible.
    Any other attribute yields an empty list.
    """
    hits = []
    for name, info in patterns_database.items():
        if attribute not in info:
            continue
        for candidate in info[attribute]:
            if attribute == "filename":
                result_key = "isLibrary"
                matched = candidate.lower() == content.lower()
            elif attribute == "filecontent":
                result_key = "containsLibrary"
                matched = re.search(candidate.encode("utf-8"), content)
            else:
                continue
            if matched:
                hits.append({result_key: name})
    return hits
2 changes: 2 additions & 0 deletions surfactant/plugin/manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ def _register_plugins(pm: pluggy.PluginManager) -> None:
java_file,
js_file,
mach_o_file,
native_lib_file,
ole_file,
pe_file,
)
Expand Down Expand Up @@ -62,6 +63,7 @@ def _register_plugins(pm: pluggy.PluginManager) -> None:
cyclonedx_writer,
spdx_writer,
cytrics_reader,
native_lib_file,
)
for plugin in internal_plugins:
pm.register(plugin)
Expand Down

0 comments on commit bff6a2f

Please sign in to comment.