replace tqdm, termcolor, tabulate with rich (#2374)
* logging: use rich handler for logging

* tqdm: remove unneeded redirecting_print_to_tqdm function

* tqdm: introduce `CapaProgressBar`, a rich `Progress` bar (see the sketch after the capa/capabilities/dynamic.py diff below)

* tqdm: replace tqdm with rich Progress bar

* tqdm: remove tqdm dependency

* termcolor: replace termcolor with rich and update `scripts/`

* tests: update `test_render.py` to use rich.console.Console

* termcolor: remove termcolor dependency

* capa.render.utils: add `write` & `writeln` methods to a `Console` subclass (see the sketch below)

* update markup util functions to use format strings

* tests: update `test_render.py` to use `capa.render.utils.Console`

* replace kwarg `end=""` with `write` and `writeln` methods

* tabulate: replace tabulate with `rich.table` (see the sketch below)

* tabulate: remove `tabulate` and its dependency `wcwidth`

* logging: handle logging in `capa.main`

* logging: set up logging in `capa.main`

this commit sets up logging in `capa.main` and uses a shared
`log_console` from `capa.helpers` for log output (see the sketch below)

* changelog: record the replacement of tqdm, termcolor, and tabulate with rich

* remove entry from the pyinstaller spec and an unneeded `progress.update` call

* update requirements.txt

* scripts: use `capa.helpers.log_console` in `CapaProgressBar`

* logging: configure root logger to use `RichHandler`

* remove unused import `inspect`
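
As context for the logging-related items above, here is a minimal sketch of the described arrangement: a shared `log_console` in `capa.helpers` and the root logger routed through rich's `RichHandler`. The `setup_logging` helper name and the handler options are illustrative assumptions, not the commit's exact code.

# sketch only: shared rich console plus RichHandler on the root logger (assumed details)
import logging

from rich.console import Console
from rich.logging import RichHandler

# one console shared by log records and progress rendering,
# so progress bars and log lines do not clobber each other
log_console = Console(stderr=True)

def setup_logging(level: int = logging.INFO) -> None:  # hypothetical helper name
    handler = RichHandler(console=log_console, show_time=True, show_path=False)
    logging.basicConfig(level=level, format="%(message)s", handlers=[handler])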
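
The `write`/`writeln` additions to the `Console` subclass in `capa.render.utils` are not shown in the rendered diff; a minimal sketch, assuming they simply wrap `Console.print` with and without a trailing newline, might look like:

# sketch only: Console subclass with write/writeln convenience methods (assumed behavior)
import rich.console

class Console(rich.console.Console):
    def write(self, *args) -> None:
        # print without appending a newline, replacing call sites that passed end=""
        self.print(*args, end="")

    def writeln(self, *args) -> None:
        # print with the default trailing newline
        self.print(*args)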
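
Likewise, the tabulate-to-`rich.table` change is not visible in this excerpt; the sketch below shows the general pattern, with hypothetical column names and helper rather than capa's actual render code.

# sketch only: building a table with rich.table instead of tabulate (illustrative columns)
from rich.console import Console
from rich.table import Table

def render_rows(rows: list[tuple[str, str]]) -> None:  # hypothetical helper
    table = Table(show_header=True, header_style="bold")
    table.add_column("Capability")
    table.add_column("Namespace")
    for capability, namespace in rows:
        table.add_row(capability, namespace)
    Console().print(table)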
fariss authored Sep 27, 2024
1 parent 558bf0f commit 51a4eb4
Showing 16 changed files with 898 additions and 814 deletions.
3 changes: 0 additions & 3 deletions .github/mypy/mypy.ini
@@ -1,8 +1,5 @@
[mypy]

-[mypy-tqdm.*]
-ignore_missing_imports = True
-
[mypy-ruamel.*]
ignore_missing_imports = True

13 changes: 0 additions & 13 deletions .github/pyinstaller/pyinstaller.spec
@@ -2,7 +2,6 @@
# Copyright (C) 2020 Mandiant, Inc. All Rights Reserved.
import sys

-import wcwidth
import capa.rules.cache

from pathlib import Path
@@ -29,13 +28,6 @@ a = Analysis(
("../../rules", "rules"),
("../../sigs", "sigs"),
("../../cache", "cache"),
-        # capa.render.default uses tabulate that depends on wcwidth.
-        # it seems wcwidth uses a json file `version.json`
-        # and this doesn't get picked up by pyinstaller automatically.
-        # so we manually embed the wcwidth resources here.
-        #
-        # ref: https://stackoverflow.com/a/62278462/87207
-        (Path(wcwidth.__file__).parent, "wcwidth"),
],
# when invoking pyinstaller from the project root,
# this gets run from the project root.
@@ -48,11 +40,6 @@ a = Analysis(
"tkinter",
"_tkinter",
"Tkinter",
-        # tqdm provides renderers for ipython,
-        # however, this drags in a lot of dependencies.
-        # since we don't spawn a notebook, we can safely remove these.
-        "IPython",
-        "ipywidgets",
# these are pulled in by networkx
# but we don't need to compute the strongly connected components.
"numpy",
946 changes: 501 additions & 445 deletions CHANGELOG.md

Large diffs are not rendered by default.

62 changes: 25 additions & 37 deletions capa/capabilities/dynamic.py
@@ -6,20 +6,16 @@
# Unless required by applicable law or agreed to in writing, software distributed under the License
# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and limitations under the License.
-import sys
import logging
import itertools
import collections
-from typing import Any, Tuple
-
-import tqdm
+from typing import Any, List, Tuple

import capa.perf
import capa.features.freeze as frz
import capa.render.result_document as rdoc
from capa.rules import Scope, RuleSet
from capa.engine import FeatureSet, MatchResults
-from capa.helpers import redirecting_print_to_tqdm
from capa.capabilities.common import find_file_capabilities
from capa.features.extractors.base_extractor import CallHandle, ThreadHandle, ProcessHandle, DynamicFeatureExtractor

@@ -139,38 +135,30 @@ def find_dynamic_capabilities(
feature_counts = rdoc.DynamicFeatureCounts(file=0, processes=())

assert isinstance(extractor, DynamicFeatureExtractor)
-    with redirecting_print_to_tqdm(disable_progress):
-        with tqdm.contrib.logging.logging_redirect_tqdm():
-            pbar = tqdm.tqdm
-            if disable_progress:
-                # do not use tqdm to avoid unnecessary side effects when caller intends
-                # to disable progress completely
-                def pbar(s, *args, **kwargs):
-                    return s
-
-            elif not sys.stderr.isatty():
-                # don't display progress bar when stderr is redirected to a file
-                def pbar(s, *args, **kwargs):
-                    return s
-
-            processes = list(extractor.get_processes())
-
-            pb = pbar(processes, desc="matching", unit=" processes", leave=False)
-            for p in pb:
-                process_matches, thread_matches, call_matches, feature_count = find_process_capabilities(
-                    ruleset, extractor, p
-                )
-                feature_counts.processes += (
-                    rdoc.ProcessFeatureCount(address=frz.Address.from_capa(p.address), count=feature_count),
-                )
-                logger.debug("analyzed %s and extracted %d features", p.address, feature_count)
-
-                for rule_name, res in process_matches.items():
-                    all_process_matches[rule_name].extend(res)
-                for rule_name, res in thread_matches.items():
-                    all_thread_matches[rule_name].extend(res)
-                for rule_name, res in call_matches.items():
-                    all_call_matches[rule_name].extend(res)
+    processes: List[ProcessHandle] = list(extractor.get_processes())
+    n_processes: int = len(processes)
+
+    with capa.helpers.CapaProgressBar(
+        console=capa.helpers.log_console, transient=True, disable=disable_progress
+    ) as pbar:
+        task = pbar.add_task("matching", total=n_processes, unit="processes")
+        for p in processes:
+            process_matches, thread_matches, call_matches, feature_count = find_process_capabilities(
+                ruleset, extractor, p
+            )
+            feature_counts.processes += (
+                rdoc.ProcessFeatureCount(address=frz.Address.from_capa(p.address), count=feature_count),
+            )
+            logger.debug("analyzed %s and extracted %d features", p.address, feature_count)
+
+            for rule_name, res in process_matches.items():
+                all_process_matches[rule_name].extend(res)
+            for rule_name, res in thread_matches.items():
+                all_thread_matches[rule_name].extend(res)
+            for rule_name, res in call_matches.items():
+                all_call_matches[rule_name].extend(res)
+
+            pbar.advance(task)

# collection of features that captures the rule matches within process and thread scopes.
# mapping from feature (matched rule) to set of addresses at which it matched.
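
Note on the hunk above: `capa.helpers.CapaProgressBar` and `capa.helpers.log_console` are defined elsewhere in this commit and are not part of the rendered diff. A minimal sketch of such a rich `Progress` subclass, assuming custom columns that read per-task `unit` and `postfix` fields, could look like the following; in rich, extra keyword arguments passed to `add_task()` and `update()` (as in the hunks above and below) are stored in `task.fields`.

# sketch only: a rich Progress subclass with unit/postfix columns (assumed column set)
from rich.progress import BarColumn, Progress, TaskProgressColumn, TextColumn, TimeRemainingColumn

class CapaProgressBar(Progress):
    @classmethod
    def get_default_columns(cls):
        # assumes every task supplies "unit" and "postfix" fields via add_task()/update()
        return (
            TextColumn("[progress.description]{task.description}"),
            BarColumn(),
            TaskProgressColumn(),
            # completed/total count rendered with the task's custom "unit" field
            TextColumn("{task.completed}/{task.total} {task.fields[unit]}"),
            TimeRemainingColumn(),
            # free-form trailer, e.g. "skipped N library functions, P%"
            TextColumn("{task.fields[postfix]}"),
        )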
126 changes: 53 additions & 73 deletions capa/capabilities/static.py
@@ -6,21 +6,18 @@
# Unless required by applicable law or agreed to in writing, software distributed under the License
# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and limitations under the License.
-import sys
import time
import logging
import itertools
import collections
-from typing import Any, Tuple
-
-import tqdm.contrib.logging
+from typing import Any, List, Tuple

import capa.perf
+import capa.helpers
import capa.features.freeze as frz
import capa.render.result_document as rdoc
from capa.rules import Scope, RuleSet
from capa.engine import FeatureSet, MatchResults
-from capa.helpers import redirecting_print_to_tqdm
from capa.capabilities.common import find_file_capabilities
from capa.features.extractors.base_extractor import BBHandle, InsnHandle, FunctionHandle, StaticFeatureExtractor

@@ -143,75 +140,58 @@ def find_static_capabilities(
library_functions: Tuple[rdoc.LibraryFunction, ...] = ()

assert isinstance(extractor, StaticFeatureExtractor)
-    with redirecting_print_to_tqdm(disable_progress):
-        with tqdm.contrib.logging.logging_redirect_tqdm():
-            pbar = tqdm.tqdm
-            if capa.helpers.is_runtime_ghidra():
-                # Ghidrathon interpreter cannot properly handle
-                # the TMonitor thread that is created via a monitor_interval
-                # > 0
-                pbar.monitor_interval = 0
-            if disable_progress:
-                # do not use tqdm to avoid unnecessary side effects when caller intends
-                # to disable progress completely
-                def pbar(s, *args, **kwargs):
-                    return s
-
-            elif not sys.stderr.isatty():
-                # don't display progress bar when stderr is redirected to a file
-                def pbar(s, *args, **kwargs):
-                    return s
-
-            functions = list(extractor.get_functions())
-            n_funcs = len(functions)
-
-            pb = pbar(functions, desc="matching", unit=" functions", postfix="skipped 0 library functions", leave=False)
-            for f in pb:
-                t0 = time.time()
-                if extractor.is_library_function(f.address):
-                    function_name = extractor.get_function_name(f.address)
-                    logger.debug("skipping library function 0x%x (%s)", f.address, function_name)
-                    library_functions += (
-                        rdoc.LibraryFunction(address=frz.Address.from_capa(f.address), name=function_name),
-                    )
-                    n_libs = len(library_functions)
-                    percentage = round(100 * (n_libs / n_funcs))
-                    if isinstance(pb, tqdm.tqdm):
-                        pb.set_postfix_str(f"skipped {n_libs} library functions ({percentage}%)")
-                    continue
-
-                function_matches, bb_matches, insn_matches, feature_count = find_code_capabilities(
-                    ruleset, extractor, f
-                )
-                feature_counts.functions += (
-                    rdoc.FunctionFeatureCount(address=frz.Address.from_capa(f.address), count=feature_count),
-                )
-                t1 = time.time()
-
-                match_count = 0
-                for name, matches_ in itertools.chain(
-                    function_matches.items(), bb_matches.items(), insn_matches.items()
-                ):
-                    # in practice, most matches are derived rules,
-                    # like "check OS version/5bf4c7f39fd4492cbed0f6dc7d596d49"
-                    # but when we log to the human, they really care about "real" rules.
-                    if not ruleset.rules[name].is_subscope_rule():
-                        match_count += len(matches_)
-
-                logger.debug(
-                    "analyzed function 0x%x and extracted %d features, %d matches in %0.02fs",
-                    f.address,
-                    feature_count,
-                    match_count,
-                    t1 - t0,
+    functions: List[FunctionHandle] = list(extractor.get_functions())
+    n_funcs: int = len(functions)
+    n_libs: int = 0
+    percentage: float = 0
+
+    with capa.helpers.CapaProgressBar(
+        console=capa.helpers.log_console, transient=True, disable=disable_progress
+    ) as pbar:
+        task = pbar.add_task(
+            "matching", total=n_funcs, unit="functions", postfix=f"skipped {n_libs} library functions, {percentage}%"
+        )
+        for f in functions:
+            t0 = time.time()
+            if extractor.is_library_function(f.address):
+                function_name = extractor.get_function_name(f.address)
+                logger.debug("skipping library function 0x%x (%s)", f.address, function_name)
+                library_functions += (
+                    rdoc.LibraryFunction(address=frz.Address.from_capa(f.address), name=function_name),
)
-
-                for rule_name, res in function_matches.items():
-                    all_function_matches[rule_name].extend(res)
-                for rule_name, res in bb_matches.items():
-                    all_bb_matches[rule_name].extend(res)
-                for rule_name, res in insn_matches.items():
-                    all_insn_matches[rule_name].extend(res)
+                n_libs = len(library_functions)
+                percentage = round(100 * (n_libs / n_funcs))
+                pbar.update(task, postfix=f"skipped {n_libs} library functions, {percentage}%")
+                pbar.advance(task)
+                continue
+
+            function_matches, bb_matches, insn_matches, feature_count = find_code_capabilities(ruleset, extractor, f)
+            feature_counts.functions += (
+                rdoc.FunctionFeatureCount(address=frz.Address.from_capa(f.address), count=feature_count),
+            )
+            t1 = time.time()
+
+            match_count = 0
+            for name, matches_ in itertools.chain(function_matches.items(), bb_matches.items(), insn_matches.items()):
+                if not ruleset.rules[name].is_subscope_rule():
+                    match_count += len(matches_)
+
+            logger.debug(
+                "analyzed function 0x%x and extracted %d features, %d matches in %0.02fs",
+                f.address,
+                feature_count,
+                match_count,
+                t1 - t0,
+            )
+
+            for rule_name, res in function_matches.items():
+                all_function_matches[rule_name].extend(res)
+            for rule_name, res in bb_matches.items():
+                all_bb_matches[rule_name].extend(res)
+            for rule_name, res in insn_matches.items():
+                all_insn_matches[rule_name].extend(res)
+
+            pbar.advance(task)

# collection of features that captures the rule matches within function, BB, and instruction scopes.
# mapping from feature (matched rule) to set of addresses at which it matched.
