Skip to content

Commit

Permalink
dynamic: vmray: use monitor IDs to track processes and threads
Browse files Browse the repository at this point in the history
  • Loading branch information
mike-hunhoff committed Sep 25, 2024
1 parent db3c0cb commit ed98697
Show file tree
Hide file tree
Showing 5 changed files with 108 additions and 85 deletions.
94 changes: 57 additions & 37 deletions capa/features/extractors/vmray/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from pathlib import Path
from zipfile import ZipFile
from collections import defaultdict
from dataclasses import dataclass

from capa.exceptions import UnsupportedFormatError
from capa.features.extractors.vmray.models import File, Flog, SummaryV2, StaticData, FunctionCall, xml_to_dict
Expand All @@ -21,6 +22,21 @@
SUPPORTED_FLOG_VERSIONS = ("2",)


@dataclass
class VMRayMonitorThread:
    """Identifiers describing a single thread observed by VMRay."""

    tid: int  # thread ID assigned by OS
    monitor_id: int  # unique ID assigned to thread by VMRay
    process_monitor_id: int  # unique ID assigned to containing process by VMRay


@dataclass
class VMRayMonitorProcess:
    """Identifiers and image name describing a single process observed by VMRay."""

    pid: int  # process ID assigned by OS
    ppid: int  # parent process ID assigned by OS
    monitor_id: int  # unique ID assigned to process by VMRay
    image_name: str  # image name reported for the process


class VMRayAnalysis:
def __init__(self, zipfile_path: Path):
self.zipfile = ZipFile(zipfile_path, "r")
Expand All @@ -45,9 +61,11 @@ def __init__(self, zipfile_path: Path):
self.exports: Dict[int, str] = {}
self.imports: Dict[int, Tuple[str, str]] = {}
self.sections: Dict[int, str] = {}
self.os_pid_by_monitor_id: Dict[int, int] = {}
self.tids_by_pid: Dict[int, List[int]] = defaultdict(list)
self.process_calls: Dict[int, Dict[int, List[FunctionCall]]] = defaultdict(lambda: defaultdict(list))
self.monitor_processes: Dict[int, VMRayMonitorProcess] = {}
self.monitor_threads: Dict[int, VMRayMonitorThread] = {}
self.monitor_threads_by_monitor_process: Dict[int, List[int]] = defaultdict(list)
self.monitor_process_calls: Dict[int, Dict[int, List[FunctionCall]]] = defaultdict(lambda: defaultdict(list))

self.base_address: int

self.sample_file_name: Optional[str] = None
Expand Down Expand Up @@ -79,13 +97,14 @@ def __init__(self, zipfile_path: Path):

self.sample_file_buf: bytes = self.zipfile.read(sample_file_path, pwd=DEFAULT_ARCHIVE_PASSWORD)

# do not change order, it matters
self._compute_base_address()
self._compute_imports()
self._compute_exports()
self._compute_sections()
self._compute_process_ids()
self._compute_process_threads()
self._compute_process_calls()
self._compute_monitor_processes()
self._compute_monitor_threads()
self._compute_monitor_process_calls()

def _find_sample_file(self):
for file_name, file_analysis in self.sv2.files.items():
Expand Down Expand Up @@ -128,41 +147,42 @@ def _compute_sections(self):
for elffile_section in self.sample_file_static_data.elf.sections:
self.sections[elffile_section.header.sh_addr] = elffile_section.header.sh_name

def _compute_process_ids(self):
def _compute_monitor_processes(self):
for process in self.sv2.processes.values():
# we expect VMRay's monitor IDs to be unique, but OS PIDs may be reused
assert process.monitor_id not in self.os_pid_by_monitor_id.keys()
self.os_pid_by_monitor_id[process.monitor_id] = process.os_pid
# we expect monitor IDs to be unique
assert process.monitor_id not in self.monitor_processes.keys()

# not all processes may get an ID, get missing data from flog.xml, see #2394
ppid: int = (
self.sv2.processes[process.ref_parent_process.path[1]].os_pid if process.ref_parent_process else 0
)
self.monitor_processes[process.monitor_id] = VMRayMonitorProcess(
process.os_pid, ppid, process.monitor_id, process.image_name
)

# not all processes are recorded in SummaryV2.json, get missing data from flog.xml, see #2394
for monitor_process in self.flog.analysis.monitor_processes:
if monitor_process.process_id in self.os_pid_by_monitor_id:
assert self.os_pid_by_monitor_id[monitor_process.process_id] == monitor_process.os_pid
else:
self.os_pid_by_monitor_id[monitor_process.process_id] = monitor_process.os_pid

def _compute_process_threads(self):
# logs/flog.xml appears to be the only file that contains thread-related data
# so we use it here to map processes to threads
for function_call in self.flog.analysis.function_calls:
pid: int = self.get_process_os_pid(function_call.process_id) # flog.xml uses process monitor ID, not OS PID
tid: int = function_call.thread_id
if monitor_process.process_id not in self.monitor_processes.keys():
self.monitor_processes[monitor_process.process_id] = VMRayMonitorProcess(
monitor_process.os_pid,
monitor_process.os_parent_pid,
monitor_process.process_id,
monitor_process.image_name,
)

def _compute_monitor_threads(self):
for monitor_thread in self.flog.analysis.monitor_threads:
# we expect monitor IDs to be unique
assert monitor_thread.thread_id not in self.monitor_threads.keys()

self.monitor_threads[monitor_thread.thread_id] = VMRayMonitorThread(
monitor_thread.os_tid, monitor_thread.thread_id, monitor_thread.process_id
)

assert isinstance(pid, int)
assert isinstance(tid, int)
# we expect each monitor thread ID to be recorded at most once for its containing monitor process,
# so check the list keyed by the process monitor ID (not by the thread monitor ID)
assert monitor_thread.thread_id not in self.monitor_threads_by_monitor_process[monitor_thread.process_id]

if tid not in self.tids_by_pid[pid]:
self.tids_by_pid[pid].append(tid)
self.monitor_threads_by_monitor_process[monitor_thread.process_id].append(monitor_thread.thread_id)

def _compute_process_calls(self):
def _compute_monitor_process_calls(self):
for function_call in self.flog.analysis.function_calls:
pid: int = self.get_process_os_pid(function_call.process_id) # flog.xml uses process monitor ID, not OS PID
tid: int = function_call.thread_id

assert isinstance(pid, int)
assert isinstance(tid, int)

self.process_calls[pid][tid].append(function_call)

def get_process_os_pid(self, monitor_id: int) -> int:
return self.os_pid_by_monitor_id[monitor_id]
self.monitor_process_calls[function_call.process_id][function_call.thread_id].append(function_call)
33 changes: 23 additions & 10 deletions capa/features/extractors/vmray/extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,16 @@
import capa.features.extractors.vmray.file
import capa.features.extractors.vmray.global_
from capa.features.common import Feature, Characteristic
from capa.features.address import NO_ADDRESS, Address, ThreadAddress, DynamicCallAddress, AbsoluteVirtualAddress
from capa.features.extractors.vmray import VMRayAnalysis
from capa.features.extractors.vmray.models import PARAM_TYPE_STR, Process, ParamList, FunctionCall
from capa.features.address import (
NO_ADDRESS,
Address,
ThreadAddress,
ProcessAddress,
DynamicCallAddress,
AbsoluteVirtualAddress,
)
from capa.features.extractors.vmray import VMRayAnalysis, VMRayMonitorThread, VMRayMonitorProcess
from capa.features.extractors.vmray.models import PARAM_TYPE_STR, ParamList, FunctionCall
from capa.features.extractors.base_extractor import (
CallHandle,
SampleHashes,
Expand Down Expand Up @@ -69,20 +76,26 @@ def extract_global_features(self) -> Iterator[Tuple[Feature, Address]]:
yield from self.global_features

def get_processes(self) -> Iterator[ProcessHandle]:
yield from capa.features.extractors.vmray.file.get_processes(self.analysis)
for monitor_process_id in self.analysis.monitor_processes:
monitor_process: VMRayMonitorProcess = self.analysis.monitor_processes[monitor_process_id]

address: ProcessAddress = ProcessAddress(pid=monitor_process.pid, ppid=monitor_process.ppid)
yield ProcessHandle(address, inner=monitor_process)

def extract_process_features(self, ph: ProcessHandle) -> Iterator[Tuple[Feature, Address]]:
# we have not identified process-specific features for VMRay yet
yield from []

def get_process_name(self, ph) -> str:
process: Process = ph.inner
return process.image_name
monitor_process: VMRayMonitorProcess = ph.inner
return monitor_process.image_name

def get_threads(self, ph: ProcessHandle) -> Iterator[ThreadHandle]:
for thread in self.analysis.tids_by_pid[ph.address.pid]:
address: ThreadAddress = ThreadAddress(process=ph.address, tid=thread)
yield ThreadHandle(address=address, inner={})
for monitor_thread_id in self.analysis.monitor_threads_by_monitor_process[ph.inner.monitor_id]:
monitor_thread: VMRayMonitorThread = self.analysis.monitor_threads[monitor_thread_id]

address: ThreadAddress = ThreadAddress(process=ph.address, tid=monitor_thread.tid)
yield ThreadHandle(address=address, inner=monitor_thread)

def extract_thread_features(self, ph: ProcessHandle, th: ThreadHandle) -> Iterator[Tuple[Feature, Address]]:
if False:
Expand All @@ -92,7 +105,7 @@ def extract_thread_features(self, ph: ProcessHandle, th: ThreadHandle) -> Iterat
return

def get_calls(self, ph: ProcessHandle, th: ThreadHandle) -> Iterator[CallHandle]:
for function_call in self.analysis.process_calls[ph.address.pid][th.address.tid]:
for function_call in self.analysis.monitor_process_calls[ph.inner.monitor_id][th.inner.monitor_id]:
addr = DynamicCallAddress(thread=th.address, id=function_call.fncall_id)
yield CallHandle(address=addr, inner=function_call)

Expand Down
23 changes: 2 additions & 21 deletions capa/features/extractors/vmray/file.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,37 +6,18 @@
# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and limitations under the License.
import logging
from typing import Dict, Tuple, Iterator
from typing import Tuple, Iterator

import capa.features.extractors.common
from capa.features.file import Export, Import, Section
from capa.features.common import String, Feature
from capa.features.address import NO_ADDRESS, Address, ProcessAddress, AbsoluteVirtualAddress
from capa.features.address import NO_ADDRESS, Address, AbsoluteVirtualAddress
from capa.features.extractors.vmray import VMRayAnalysis
from capa.features.extractors.helpers import generate_symbols
from capa.features.extractors.vmray.models import Process
from capa.features.extractors.base_extractor import ProcessHandle

logger = logging.getLogger(__name__)


def get_processes(analysis: VMRayAnalysis) -> Iterator[ProcessHandle]:
processes: Dict[str, Process] = analysis.sv2.processes

for process in processes.values():
# we map VMRay's monitor ID to the OS PID to make it easier for users
# to follow the processes in capa's output
pid: int = analysis.get_process_os_pid(process.monitor_id)
ppid: int = (
analysis.get_process_os_pid(processes[process.ref_parent_process.path[1]].monitor_id)
if process.ref_parent_process
else 0
)

addr: ProcessAddress = ProcessAddress(pid=pid, ppid=ppid)
yield ProcessHandle(address=addr, inner=process)


def extract_export_names(analysis: VMRayAnalysis) -> Iterator[Tuple[Feature, Address]]:
for addr, name in analysis.exports.items():
yield Export(name), AbsoluteVirtualAddress(addr)
Expand Down
9 changes: 9 additions & 0 deletions capa/features/extractors/vmray/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -156,8 +156,16 @@ class MonitorProcess(BaseModel):
# os_groups: str


class MonitorThread(BaseModel):
    """A thread record parsed from the analysis log's `monitor_thread` entries."""

    ts: HexInt  # presumably a timestamp; TODO confirm against the flog.xml format
    thread_id: int  # monitor ID assigned to this thread by VMRay
    process_id: int  # monitor ID assigned to the containing process by VMRay
    os_tid: HexInt  # thread ID assigned by OS


# handle if there's only single entries, but the model expects a list
MonitorProcessList = Annotated[List[MonitorProcess], BeforeValidator(validate_ensure_is_list)]
MonitorThreadList = Annotated[List[MonitorThread], BeforeValidator(validate_ensure_is_list)]
FunctionCallList = Annotated[List[FunctionCall], BeforeValidator(validate_ensure_is_list)]


Expand All @@ -167,6 +175,7 @@ class Analysis(BaseModel):
# analysis_date: str

monitor_processes: MonitorProcessList = Field(alias="monitor_process", default=[])
monitor_threads: MonitorThreadList = Field(alias="monitor_thread", default=[])
function_calls: FunctionCallList = Field(alias="fncall", default=[])
# function_returns: List[FunctionReturn] = Field(alias="fnret", default=[])

Expand Down
34 changes: 17 additions & 17 deletions tests/test_vmray_features.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,21 +20,21 @@
# file/imports
("93b2d1-vmray", "file", capa.features.file.Import("GetAddrInfoW"), True),
# thread/api calls
("93b2d1-vmray", "process=(2176:0),thread=7", capa.features.insn.API("GetAddrInfoW"), True),
("93b2d1-vmray", "process=(2176:0),thread=7", capa.features.insn.API("DoesNotExist"), False),
("93b2d1-vmray", "process=(2176:0),thread=2420", capa.features.insn.API("GetAddrInfoW"), True),
("93b2d1-vmray", "process=(2176:0),thread=2420", capa.features.insn.API("DoesNotExist"), False),
# call/api
("93b2d1-vmray", "process=(2176:0),thread=7,call=2361", capa.features.insn.API("GetAddrInfoW"), True),
("93b2d1-vmray", "process=(2176:0),thread=2420,call=2361", capa.features.insn.API("GetAddrInfoW"), True),
# call/string argument
(
"93b2d1-vmray",
"process=(2176:0),thread=7,call=10323",
"process=(2176:0),thread=2420,call=10323",
capa.features.common.String("raw.githubusercontent.com"),
True,
),
# call/number argument
# VirtualAlloc(4096, 4)
("93b2d1-vmray", "process=(2176:0),thread=7,call=2358", capa.features.insn.Number(4096), True),
("93b2d1-vmray", "process=(2176:0),thread=7,call=2358", capa.features.insn.Number(4), True),
("93b2d1-vmray", "process=(2176:0),thread=2420,call=2358", capa.features.insn.Number(4096), True),
("93b2d1-vmray", "process=(2176:0),thread=2420,call=2358", capa.features.insn.Number(4), True),
],
# order tests by (file, item)
# so that our LRU cache is most effective.
Expand All @@ -46,24 +46,24 @@
# file/imports
("93b2d1-vmray", "file", capa.features.file.Import("GetAddrInfoW"), 1),
# thread/api calls
("93b2d1-vmray", "process=(2176:0),thread=7", capa.features.insn.API("free"), 1),
("93b2d1-vmray", "process=(2176:0),thread=7", capa.features.insn.API("GetAddrInfoW"), 5),
("93b2d1-vmray", "process=(2176:0),thread=2420", capa.features.insn.API("free"), 1),
("93b2d1-vmray", "process=(2176:0),thread=2420", capa.features.insn.API("GetAddrInfoW"), 5),
# call/api
("93b2d1-vmray", "process=(2176:0),thread=7,call=2345", capa.features.insn.API("free"), 1),
("93b2d1-vmray", "process=(2176:0),thread=7,call=2345", capa.features.insn.API("GetAddrInfoW"), 0),
("93b2d1-vmray", "process=(2176:0),thread=7,call=2361", capa.features.insn.API("GetAddrInfoW"), 1),
("93b2d1-vmray", "process=(2176:0),thread=2420,call=2345", capa.features.insn.API("free"), 1),
("93b2d1-vmray", "process=(2176:0),thread=2420,call=2345", capa.features.insn.API("GetAddrInfoW"), 0),
("93b2d1-vmray", "process=(2176:0),thread=2420,call=2361", capa.features.insn.API("GetAddrInfoW"), 1),
# call/string argument
(
"93b2d1-vmray",
"process=(2176:0),thread=7,call=10323",
"process=(2176:0),thread=2420,call=10323",
capa.features.common.String("raw.githubusercontent.com"),
1,
),
("93b2d1-vmray", "process=(2176:0),thread=7,call=10323", capa.features.common.String("non_existant"), 0),
("93b2d1-vmray", "process=(2176:0),thread=2420,call=10323", capa.features.common.String("non_existant"), 0),
# call/number argument
("93b2d1-vmray", "process=(2176:0),thread=7,call=10315", capa.features.insn.Number(4096), 1),
("93b2d1-vmray", "process=(2176:0),thread=7,call=10315", capa.features.insn.Number(4), 1),
("93b2d1-vmray", "process=(2176:0),thread=7,call=10315", capa.features.insn.Number(404), 0),
("93b2d1-vmray", "process=(2176:0),thread=2420,call=10315", capa.features.insn.Number(4096), 1),
("93b2d1-vmray", "process=(2176:0),thread=2420,call=10315", capa.features.insn.Number(4), 1),
("93b2d1-vmray", "process=(2176:0),thread=2420,call=10315", capa.features.insn.Number(404), 0),
],
# order tests by (file, item)
# so that our LRU cache is most effective.
Expand Down Expand Up @@ -93,4 +93,4 @@ def test_vmray_processes():
# see #2394
path = fixtures.get_data_path_by_name("2f8a79-vmray")
vmre = fixtures.get_vmray_extractor(path)
assert len(vmre.analysis.os_pid_by_monitor_id) == 9
assert len(vmre.analysis.monitor_processes) == 9

0 comments on commit ed98697

Please sign in to comment.