diff --git a/capa/features/extractors/vmray/__init__.py b/capa/features/extractors/vmray/__init__.py index f7b3e48cc..64080febc 100644 --- a/capa/features/extractors/vmray/__init__.py +++ b/capa/features/extractors/vmray/__init__.py @@ -10,6 +10,7 @@ from pathlib import Path from zipfile import ZipFile from collections import defaultdict +from dataclasses import dataclass from capa.exceptions import UnsupportedFormatError from capa.features.extractors.vmray.models import File, Flog, SummaryV2, StaticData, FunctionCall, xml_to_dict @@ -21,6 +22,21 @@ SUPPORTED_FLOG_VERSIONS = ("2",) +@dataclass +class VMRayMonitorThread: + tid: int # thread ID assigned by OS + monitor_id: int # unique ID assigned to thread by VMRay + process_monitor_id: int # unique ID assigned to containing process by VMRay + + +@dataclass +class VMRayMonitorProcess: + pid: int # process ID assigned by OS + ppid: int # parent process ID assigned by OS + monitor_id: int # unique ID assigned to process by VMRay + image_name: str + + class VMRayAnalysis: def __init__(self, zipfile_path: Path): self.zipfile = ZipFile(zipfile_path, "r") @@ -45,9 +61,11 @@ def __init__(self, zipfile_path: Path): self.exports: Dict[int, str] = {} self.imports: Dict[int, Tuple[str, str]] = {} self.sections: Dict[int, str] = {} - self.os_pid_by_monitor_id: Dict[int, int] = {} - self.tids_by_pid: Dict[int, List[int]] = defaultdict(list) - self.process_calls: Dict[int, Dict[int, List[FunctionCall]]] = defaultdict(lambda: defaultdict(list)) + self.monitor_processes: Dict[int, VMRayMonitorProcess] = {} + self.monitor_threads: Dict[int, VMRayMonitorThread] = {} + self.monitor_threads_by_monitor_process: Dict[int, List[int]] = defaultdict(list) + self.monitor_process_calls: Dict[int, Dict[int, List[FunctionCall]]] = defaultdict(lambda: defaultdict(list)) + self.base_address: int self.sample_file_name: Optional[str] = None @@ -79,13 +97,14 @@ def __init__(self, zipfile_path: Path): self.sample_file_buf: bytes = 
self.zipfile.read(sample_file_path, pwd=DEFAULT_ARCHIVE_PASSWORD) + # do not change order, it matters self._compute_base_address() self._compute_imports() self._compute_exports() self._compute_sections() - self._compute_process_ids() - self._compute_process_threads() - self._compute_process_calls() + self._compute_monitor_processes() + self._compute_monitor_threads() + self._compute_monitor_process_calls() def _find_sample_file(self): for file_name, file_analysis in self.sv2.files.items(): @@ -128,41 +147,42 @@ def _compute_sections(self): for elffile_section in self.sample_file_static_data.elf.sections: self.sections[elffile_section.header.sh_addr] = elffile_section.header.sh_name - def _compute_process_ids(self): + def _compute_monitor_processes(self): for process in self.sv2.processes.values(): - # we expect VMRay's monitor IDs to be unique, but OS PIDs may be reused - assert process.monitor_id not in self.os_pid_by_monitor_id.keys() - self.os_pid_by_monitor_id[process.monitor_id] = process.os_pid + # we expect monitor IDs to be unique + assert process.monitor_id not in self.monitor_processes.keys() - # not all processes may get an ID, get missing data from flog.xml, see #2394 + ppid: int = ( + self.sv2.processes[process.ref_parent_process.path[1]].os_pid if process.ref_parent_process else 0 + ) + self.monitor_processes[process.monitor_id] = VMRayMonitorProcess( + process.os_pid, ppid, process.monitor_id, process.image_name + ) + + # not all processes are recorded in SummaryV2.json, get missing data from flog.xml, see #2394 for monitor_process in self.flog.analysis.monitor_processes: - if monitor_process.process_id in self.os_pid_by_monitor_id: - assert self.os_pid_by_monitor_id[monitor_process.process_id] == monitor_process.os_pid - else: - self.os_pid_by_monitor_id[monitor_process.process_id] = monitor_process.os_pid - - def _compute_process_threads(self): - # logs/flog.xml appears to be the only file that contains thread-related data - # so we use it here to 
map processes to threads - for function_call in self.flog.analysis.function_calls: - pid: int = self.get_process_os_pid(function_call.process_id) # flog.xml uses process monitor ID, not OS PID - tid: int = function_call.thread_id + if monitor_process.process_id not in self.monitor_processes.keys(): + self.monitor_processes[monitor_process.process_id] = VMRayMonitorProcess( + monitor_process.os_pid, + monitor_process.os_parent_pid, + monitor_process.process_id, + monitor_process.image_name, + ) + + def _compute_monitor_threads(self): + for monitor_thread in self.flog.analysis.monitor_threads: + # we expect monitor IDs to be unique + assert monitor_thread.thread_id not in self.monitor_threads.keys() + + self.monitor_threads[monitor_thread.thread_id] = VMRayMonitorThread( + monitor_thread.os_tid, monitor_thread.thread_id, monitor_thread.process_id + ) - assert isinstance(pid, int) - assert isinstance(tid, int) + # we expect each monitor thread to be listed only once per monitor process; NOTE(review): the assert below is keyed by thread_id but the append is keyed by process_id, confirm which is intended + assert monitor_thread.thread_id not in self.monitor_threads_by_monitor_process[monitor_thread.thread_id] - if tid not in self.tids_by_pid[pid]: - self.tids_by_pid[pid].append(tid) + self.monitor_threads_by_monitor_process[monitor_thread.process_id].append(monitor_thread.thread_id) - def _compute_process_calls(self): + def _compute_monitor_process_calls(self): for function_call in self.flog.analysis.function_calls: - pid: int = self.get_process_os_pid(function_call.process_id) # flog.xml uses process monitor ID, not OS PID - tid: int = function_call.thread_id - - assert isinstance(pid, int) - assert isinstance(tid, int) - - self.process_calls[pid][tid].append(function_call) - - def get_process_os_pid(self, monitor_id: int) -> int: - return self.os_pid_by_monitor_id[monitor_id] + self.monitor_process_calls[function_call.process_id][function_call.thread_id].append(function_call) diff --git a/capa/features/extractors/vmray/extractor.py b/capa/features/extractors/vmray/extractor.py index 9a8360138..7db8688f2 100644 
--- a/capa/features/extractors/vmray/extractor.py +++ b/capa/features/extractors/vmray/extractor.py @@ -15,9 +15,16 @@ import capa.features.extractors.vmray.file import capa.features.extractors.vmray.global_ from capa.features.common import Feature, Characteristic -from capa.features.address import NO_ADDRESS, Address, ThreadAddress, DynamicCallAddress, AbsoluteVirtualAddress -from capa.features.extractors.vmray import VMRayAnalysis -from capa.features.extractors.vmray.models import PARAM_TYPE_STR, Process, ParamList, FunctionCall +from capa.features.address import ( + NO_ADDRESS, + Address, + ThreadAddress, + ProcessAddress, + DynamicCallAddress, + AbsoluteVirtualAddress, +) +from capa.features.extractors.vmray import VMRayAnalysis, VMRayMonitorThread, VMRayMonitorProcess +from capa.features.extractors.vmray.models import PARAM_TYPE_STR, ParamList, FunctionCall from capa.features.extractors.base_extractor import ( CallHandle, SampleHashes, @@ -69,20 +76,26 @@ def extract_global_features(self) -> Iterator[Tuple[Feature, Address]]: yield from self.global_features def get_processes(self) -> Iterator[ProcessHandle]: - yield from capa.features.extractors.vmray.file.get_processes(self.analysis) + for monitor_process_id in self.analysis.monitor_processes: + monitor_process: VMRayMonitorProcess = self.analysis.monitor_processes[monitor_process_id] + + address: ProcessAddress = ProcessAddress(pid=monitor_process.pid, ppid=monitor_process.ppid) + yield ProcessHandle(address, inner=monitor_process) def extract_process_features(self, ph: ProcessHandle) -> Iterator[Tuple[Feature, Address]]: # we have not identified process-specific features for VMRay yet yield from [] def get_process_name(self, ph) -> str: - process: Process = ph.inner - return process.image_name + monitor_process: VMRayMonitorProcess = ph.inner + return monitor_process.image_name def get_threads(self, ph: ProcessHandle) -> Iterator[ThreadHandle]: - for thread in self.analysis.tids_by_pid[ph.address.pid]: - 
address: ThreadAddress = ThreadAddress(process=ph.address, tid=thread) - yield ThreadHandle(address=address, inner={}) + for monitor_thread_id in self.analysis.monitor_threads_by_monitor_process[ph.inner.monitor_id]: + monitor_thread: VMRayMonitorThread = self.analysis.monitor_threads[monitor_thread_id] + + address: ThreadAddress = ThreadAddress(process=ph.address, tid=monitor_thread.tid) + yield ThreadHandle(address=address, inner=monitor_thread) def extract_thread_features(self, ph: ProcessHandle, th: ThreadHandle) -> Iterator[Tuple[Feature, Address]]: if False: @@ -92,7 +105,7 @@ def extract_thread_features(self, ph: ProcessHandle, th: ThreadHandle) -> Iterat return def get_calls(self, ph: ProcessHandle, th: ThreadHandle) -> Iterator[CallHandle]: - for function_call in self.analysis.process_calls[ph.address.pid][th.address.tid]: + for function_call in self.analysis.monitor_process_calls[ph.inner.monitor_id][th.inner.monitor_id]: addr = DynamicCallAddress(thread=th.address, id=function_call.fncall_id) yield CallHandle(address=addr, inner=function_call) diff --git a/capa/features/extractors/vmray/file.py b/capa/features/extractors/vmray/file.py index 38ac9db01..7f4ba0395 100644 --- a/capa/features/extractors/vmray/file.py +++ b/capa/features/extractors/vmray/file.py @@ -6,37 +6,18 @@ # is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and limitations under the License. 
import logging -from typing import Dict, Tuple, Iterator +from typing import Tuple, Iterator import capa.features.extractors.common from capa.features.file import Export, Import, Section from capa.features.common import String, Feature -from capa.features.address import NO_ADDRESS, Address, ProcessAddress, AbsoluteVirtualAddress +from capa.features.address import NO_ADDRESS, Address, AbsoluteVirtualAddress from capa.features.extractors.vmray import VMRayAnalysis from capa.features.extractors.helpers import generate_symbols -from capa.features.extractors.vmray.models import Process -from capa.features.extractors.base_extractor import ProcessHandle logger = logging.getLogger(__name__) -def get_processes(analysis: VMRayAnalysis) -> Iterator[ProcessHandle]: - processes: Dict[str, Process] = analysis.sv2.processes - - for process in processes.values(): - # we map VMRay's monitor ID to the OS PID to make it easier for users - # to follow the processes in capa's output - pid: int = analysis.get_process_os_pid(process.monitor_id) - ppid: int = ( - analysis.get_process_os_pid(processes[process.ref_parent_process.path[1]].monitor_id) - if process.ref_parent_process - else 0 - ) - - addr: ProcessAddress = ProcessAddress(pid=pid, ppid=ppid) - yield ProcessHandle(address=addr, inner=process) - - def extract_export_names(analysis: VMRayAnalysis) -> Iterator[Tuple[Feature, Address]]: for addr, name in analysis.exports.items(): yield Export(name), AbsoluteVirtualAddress(addr) diff --git a/capa/features/extractors/vmray/models.py b/capa/features/extractors/vmray/models.py index 0a0c58859..f5371bec1 100644 --- a/capa/features/extractors/vmray/models.py +++ b/capa/features/extractors/vmray/models.py @@ -156,8 +156,16 @@ class MonitorProcess(BaseModel): # os_groups: str +class MonitorThread(BaseModel): + ts: HexInt + thread_id: int + process_id: int + os_tid: HexInt + + # handle if there's only single entries, but the model expects a list MonitorProcessList = 
Annotated[List[MonitorProcess], BeforeValidator(validate_ensure_is_list)] +MonitorThreadList = Annotated[List[MonitorThread], BeforeValidator(validate_ensure_is_list)] FunctionCallList = Annotated[List[FunctionCall], BeforeValidator(validate_ensure_is_list)] @@ -167,6 +175,7 @@ class Analysis(BaseModel): # analysis_date: str monitor_processes: MonitorProcessList = Field(alias="monitor_process", default=[]) + monitor_threads: MonitorThreadList = Field(alias="monitor_thread", default=[]) function_calls: FunctionCallList = Field(alias="fncall", default=[]) # function_returns: List[FunctionReturn] = Field(alias="fnret", default=[]) diff --git a/tests/test_vmray_features.py b/tests/test_vmray_features.py index 0fd338eaf..02eb683ec 100644 --- a/tests/test_vmray_features.py +++ b/tests/test_vmray_features.py @@ -20,21 +20,21 @@ # file/imports ("93b2d1-vmray", "file", capa.features.file.Import("GetAddrInfoW"), True), # thread/api calls - ("93b2d1-vmray", "process=(2176:0),thread=7", capa.features.insn.API("GetAddrInfoW"), True), - ("93b2d1-vmray", "process=(2176:0),thread=7", capa.features.insn.API("DoesNotExist"), False), + ("93b2d1-vmray", "process=(2176:0),thread=2420", capa.features.insn.API("GetAddrInfoW"), True), + ("93b2d1-vmray", "process=(2176:0),thread=2420", capa.features.insn.API("DoesNotExist"), False), # call/api - ("93b2d1-vmray", "process=(2176:0),thread=7,call=2361", capa.features.insn.API("GetAddrInfoW"), True), + ("93b2d1-vmray", "process=(2176:0),thread=2420,call=2361", capa.features.insn.API("GetAddrInfoW"), True), # call/string argument ( "93b2d1-vmray", - "process=(2176:0),thread=7,call=10323", + "process=(2176:0),thread=2420,call=10323", capa.features.common.String("raw.githubusercontent.com"), True, ), # call/number argument # VirtualAlloc(4096, 4) - ("93b2d1-vmray", "process=(2176:0),thread=7,call=2358", capa.features.insn.Number(4096), True), - ("93b2d1-vmray", "process=(2176:0),thread=7,call=2358", capa.features.insn.Number(4), True), + 
("93b2d1-vmray", "process=(2176:0),thread=2420,call=2358", capa.features.insn.Number(4096), True), + ("93b2d1-vmray", "process=(2176:0),thread=2420,call=2358", capa.features.insn.Number(4), True), ], # order tests by (file, item) # so that our LRU cache is most effective. @@ -46,24 +46,24 @@ # file/imports ("93b2d1-vmray", "file", capa.features.file.Import("GetAddrInfoW"), 1), # thread/api calls - ("93b2d1-vmray", "process=(2176:0),thread=7", capa.features.insn.API("free"), 1), - ("93b2d1-vmray", "process=(2176:0),thread=7", capa.features.insn.API("GetAddrInfoW"), 5), + ("93b2d1-vmray", "process=(2176:0),thread=2420", capa.features.insn.API("free"), 1), + ("93b2d1-vmray", "process=(2176:0),thread=2420", capa.features.insn.API("GetAddrInfoW"), 5), # call/api - ("93b2d1-vmray", "process=(2176:0),thread=7,call=2345", capa.features.insn.API("free"), 1), - ("93b2d1-vmray", "process=(2176:0),thread=7,call=2345", capa.features.insn.API("GetAddrInfoW"), 0), - ("93b2d1-vmray", "process=(2176:0),thread=7,call=2361", capa.features.insn.API("GetAddrInfoW"), 1), + ("93b2d1-vmray", "process=(2176:0),thread=2420,call=2345", capa.features.insn.API("free"), 1), + ("93b2d1-vmray", "process=(2176:0),thread=2420,call=2345", capa.features.insn.API("GetAddrInfoW"), 0), + ("93b2d1-vmray", "process=(2176:0),thread=2420,call=2361", capa.features.insn.API("GetAddrInfoW"), 1), # call/string argument ( "93b2d1-vmray", - "process=(2176:0),thread=7,call=10323", + "process=(2176:0),thread=2420,call=10323", capa.features.common.String("raw.githubusercontent.com"), 1, ), - ("93b2d1-vmray", "process=(2176:0),thread=7,call=10323", capa.features.common.String("non_existant"), 0), + ("93b2d1-vmray", "process=(2176:0),thread=2420,call=10323", capa.features.common.String("non_existant"), 0), # call/number argument - ("93b2d1-vmray", "process=(2176:0),thread=7,call=10315", capa.features.insn.Number(4096), 1), - ("93b2d1-vmray", "process=(2176:0),thread=7,call=10315", capa.features.insn.Number(4), 1), - 
("93b2d1-vmray", "process=(2176:0),thread=7,call=10315", capa.features.insn.Number(404), 0), + ("93b2d1-vmray", "process=(2176:0),thread=2420,call=10315", capa.features.insn.Number(4096), 1), + ("93b2d1-vmray", "process=(2176:0),thread=2420,call=10315", capa.features.insn.Number(4), 1), + ("93b2d1-vmray", "process=(2176:0),thread=2420,call=10315", capa.features.insn.Number(404), 0), ], # order tests by (file, item) # so that our LRU cache is most effective. @@ -93,4 +93,4 @@ def test_vmray_processes(): # see #2394 path = fixtures.get_data_path_by_name("2f8a79-vmray") vmre = fixtures.get_vmray_extractor(path) - assert len(vmre.analysis.os_pid_by_monitor_id) == 9 + assert len(vmre.analysis.monitor_processes) == 9