Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix VMRay missing process data #2396

Merged
merged 13 commits into from
Sep 26, 2024
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@

- use Python 3.12 to build extra standalone build on Linux #2383 @williballenthin
- bump minimum Python version to 3.8.1 to satisfy uv #2387 @williballenthin
- collect more process information from flog.xml #2394 @mr-tz
mike-hunhoff marked this conversation as resolved.
Show resolved Hide resolved

### capa explorer IDA Pro plugin

Expand Down
21 changes: 14 additions & 7 deletions capa/features/extractors/vmray/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,8 +45,8 @@ def __init__(self, zipfile_path: Path):
self.exports: Dict[int, str] = {}
self.imports: Dict[int, Tuple[str, str]] = {}
self.sections: Dict[int, str] = {}
self.process_ids: Dict[int, int] = {}
self.process_threads: Dict[int, List[int]] = defaultdict(list)
self.os_pid_by_monitor_id: Dict[int, int] = {}
self.tids_by_pid: Dict[int, List[int]] = defaultdict(list)
self.process_calls: Dict[int, Dict[int, List[FunctionCall]]] = defaultdict(lambda: defaultdict(list))
self.base_address: int

Expand Down Expand Up @@ -131,8 +131,15 @@ def _compute_sections(self):
def _compute_process_ids(self):
for process in self.sv2.processes.values():
mike-hunhoff marked this conversation as resolved.
Show resolved Hide resolved
# we expect VMRay's monitor IDs to be unique, but OS PIDs may be reused
assert process.monitor_id not in self.process_ids.keys()
self.process_ids[process.monitor_id] = process.os_pid
assert process.monitor_id not in self.os_pid_by_monitor_id.keys()
self.os_pid_by_monitor_id[process.monitor_id] = process.os_pid

# not all processes may get an ID; get the missing data from flog.xml, see #2394
for monitor_process in self.flog.analysis.monitor_processes:
if monitor_process.process_id in self.os_pid_by_monitor_id:
assert self.os_pid_by_monitor_id[monitor_process.process_id] == monitor_process.os_pid
else:
self.os_pid_by_monitor_id[monitor_process.process_id] = monitor_process.os_pid

def _compute_process_threads(self):
# logs/flog.xml appears to be the only file that contains thread-related data
Expand All @@ -144,8 +151,8 @@ def _compute_process_threads(self):
assert isinstance(pid, int)
assert isinstance(tid, int)

if tid not in self.process_threads[pid]:
self.process_threads[pid].append(tid)
if tid not in self.tids_by_pid[pid]:
self.tids_by_pid[pid].append(tid)

def _compute_process_calls(self):
for function_call in self.flog.analysis.function_calls:
Expand All @@ -158,4 +165,4 @@ def _compute_process_calls(self):
self.process_calls[pid][tid].append(function_call)

def get_process_os_pid(self, monitor_id: int) -> int:
return self.process_ids[monitor_id]
return self.os_pid_by_monitor_id[monitor_id]
2 changes: 1 addition & 1 deletion capa/features/extractors/vmray/extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ def get_process_name(self, ph) -> str:
return process.image_name

def get_threads(self, ph: ProcessHandle) -> Iterator[ThreadHandle]:
for thread in self.analysis.process_threads[ph.address.pid]:
for thread in self.analysis.tids_by_pid[ph.address.pid]:
address: ThreadAddress = ThreadAddress(process=ph.address, tid=thread)
yield ThreadHandle(address=address, inner={})

Expand Down
31 changes: 28 additions & 3 deletions capa/features/extractors/vmray/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@ class Param(BaseModel):
deref: Optional[ParamDeref] = None


def validate_param_list(value: Union[List[Param], Param]) -> List[Param]:
def validate_ensure_is_list(value: Union[List[Param], Param]) -> List[Param]:
if isinstance(value, list):
return value
else:
Expand All @@ -97,7 +97,7 @@ def validate_param_list(value: Union[List[Param], Param]) -> List[Param]:
# params may be stored as a list of Param or a single Param so we convert
# the input value to Python list type before the inner validation (List[Param])
# is called
ParamList = Annotated[List[Param], BeforeValidator(validate_param_list)]
ParamList = Annotated[List[Param], BeforeValidator(validate_ensure_is_list)]


class Params(BaseModel):
Expand Down Expand Up @@ -137,12 +137,37 @@ class FunctionReturn(BaseModel):
from_addr: HexInt = Field(alias="from")


# Model of a single `monitor_process` entry in the flog.xml analysis log.
# Commented-out fields below are not parsed by this model.
class MonitorProcess(BaseModel):
    ts: HexInt  # NOTE(review): presumably a timestamp parsed from a hex string - confirm against HexInt
    # used as the monitor ID key when filling os_pid_by_monitor_id
    process_id: int
    image_name: str
    filename: str
    # page_root: HexInt
    # OS-level PID; stored as the value in os_pid_by_monitor_id
    os_pid: HexInt
    # os_integrity_level: HexInt
    # os_privileges: HexInt
    monitor_reason: str
    # NOTE(review): presumably the monitor ID of the parent process - confirm
    parent_id: int
    # NOTE(review): presumably the OS-level PID of the parent process - confirm
    os_parent_pid: HexInt
    # cmd_line: str
    # cur_dir: str
    # os_username: str
    # bitness: int
    # os_groups: str


# handle the case where there is only a single entry but the model expects a list
MonitorProcessList = Annotated[List[MonitorProcess], BeforeValidator(validate_ensure_is_list)]
FunctionCallList = Annotated[List[FunctionCall], BeforeValidator(validate_ensure_is_list)]


class Analysis(BaseModel):
log_version: str # tested 2
analyzer_version: str # tested 2024.2.1
# analysis_date: str

function_calls: List[FunctionCall] = Field(alias="fncall", default=[])
monitor_processes: MonitorProcessList = Field(alias="monitor_process", default=[])
function_calls: FunctionCallList = Field(alias="fncall", default=[])
# function_returns: List[FunctionReturn] = Field(alias="fnret", default=[])


Expand Down
8 changes: 8 additions & 0 deletions tests/fixtures.py
Original file line number Diff line number Diff line change
Expand Up @@ -431,6 +431,14 @@ def get_data_path_by_name(name) -> Path:
/ "vmray"
/ "93b2d1840566f45fab674ebc79a9d19c88993bcb645e0357f3cb584d16e7c795_min_archive.zip"
)
elif name.startswith("2f8a79-vmray"):
return (
CD
/ "data"
/ "dynamic"
/ "vmray"
/ "2f8a79b12a7a989ac7e5f6ec65050036588a92e65aeb6841e08dc228ff0e21b4_min_archive.zip"
)
elif name.startswith("ea2876"):
return CD / "data" / "ea2876e9175410b6f6719f80ee44b9553960758c7d0f7bed73c0fe9a78d8e669.dll_"
elif name.startswith("1038a2"):
Expand Down
7 changes: 7 additions & 0 deletions tests/test_vmray_features.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,3 +87,10 @@ def test_vmray_features(sample, scope, feature, expected):
)
def test_vmray_feature_counts(sample, scope, feature, expected):
fixtures.do_test_feature_count(fixtures.get_vmray_extractor, sample, scope, feature, expected)


def test_vmray_processes():
    """Check the number of monitor-ID-to-OS-PID entries collected for the 2f8a79 sample (see #2394)."""
    sample_path = fixtures.get_data_path_by_name("2f8a79-vmray")
    extractor = fixtures.get_vmray_extractor(sample_path)
    pid_by_monitor_id = extractor.analysis.os_pid_by_monitor_id
    assert len(pid_by_monitor_id) == 9