Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

replace tqdm, termcolor, tabulate with rich #2374

Merged
merged 22 commits into from
Sep 27, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
41e2223
logging: use rich handler for logging
fariss Sep 15, 2024
6669a62
tqdm: remove unneeded redirecting_print_to_tqdm function
fariss Sep 15, 2024
ada9246
tqdm: introduce `CapaProgressBar` rich `Progress` bar
fariss Sep 15, 2024
727e657
tqdm: replace tqdm with rich Progress bar
fariss Sep 16, 2024
a800698
tqdm: remove tqdm dependency
fariss Sep 16, 2024
6a3d4e7
termcolor: replace termcolor and update `scripts/`
fariss Sep 17, 2024
6abe18b
tests: update `test_render.py` to use rich.console.Console
fariss Sep 17, 2024
807b215
termcolor: remove termcolor dependency
fariss Sep 17, 2024
c0a6ab9
capa.render.utils: add `write` & `writeln` methods to subclass `Console`
fariss Sep 18, 2024
75b675c
update markup util functions to use fmt strings
fariss Sep 18, 2024
13c37ae
tests: update `test_render.py` to use `capa.render.utils.Console`
fariss Sep 18, 2024
c28a6b6
replace kwarg `end=""` with `write` and `writeln` methods
fariss Sep 18, 2024
dd41874
tabulate: replace tabulate with `rich.table`
fariss Sep 19, 2024
688869a
tabulate: remove `tabulate` and its dependency `wcwidth`
fariss Sep 19, 2024
2218db7
logging: handle logging in `capa.main`
fariss Sep 19, 2024
95e806a
logging: set up logging in `capa.main`
fariss Sep 24, 2024
5770ade
changelog: replace packages with rich
fariss Sep 24, 2024
1024c49
remove entry from pyinstaller and unneeded progress.update call
fariss Sep 24, 2024
4b91652
update requirements.txt
fariss Sep 24, 2024
596a5bb
scripts: use `capa.helpers.log_console` in `CapaProgressBar`
fariss Sep 26, 2024
33249f5
logging: configure root logger to use `RichHandler`
fariss Sep 26, 2024
b9e5841
remove unused import `inspect`
fariss Sep 26, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 0 additions & 3 deletions .github/mypy/mypy.ini
Original file line number Diff line number Diff line change
@@ -1,8 +1,5 @@
[mypy]

[mypy-tqdm.*]
ignore_missing_imports = True

[mypy-ruamel.*]
ignore_missing_imports = True

Expand Down
13 changes: 0 additions & 13 deletions .github/pyinstaller/pyinstaller.spec
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
# Copyright (C) 2020 Mandiant, Inc. All Rights Reserved.
import sys

import wcwidth
import capa.rules.cache

from pathlib import Path
Expand All @@ -29,13 +28,6 @@ a = Analysis(
("../../rules", "rules"),
("../../sigs", "sigs"),
("../../cache", "cache"),
# capa.render.default uses tabulate that depends on wcwidth.
# it seems wcwidth uses a json file `version.json`
# and this doesn't get picked up by pyinstaller automatically.
# so we manually embed the wcwidth resources here.
#
# ref: https://stackoverflow.com/a/62278462/87207
(Path(wcwidth.__file__).parent, "wcwidth"),
],
# when invoking pyinstaller from the project root,
# this gets run from the project root.
Expand All @@ -48,11 +40,6 @@ a = Analysis(
"tkinter",
"_tkinter",
"Tkinter",
# tqdm provides renderers for ipython,
# however, this drags in a lot of dependencies.
# since we don't spawn a notebook, we can safely remove these.
"IPython",
"ipywidgets",
# these are pulled in by networkx
# but we don't need to compute the strongly connected components.
"numpy",
Expand Down
946 changes: 501 additions & 445 deletions CHANGELOG.md

Large diffs are not rendered by default.

62 changes: 25 additions & 37 deletions capa/capabilities/dynamic.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,20 +6,16 @@
# Unless required by applicable law or agreed to in writing, software distributed under the License
# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and limitations under the License.
import sys
import logging
import itertools
import collections
from typing import Any, Tuple

import tqdm
from typing import Any, List, Tuple

import capa.perf
import capa.features.freeze as frz
import capa.render.result_document as rdoc
from capa.rules import Scope, RuleSet
from capa.engine import FeatureSet, MatchResults
from capa.helpers import redirecting_print_to_tqdm
from capa.capabilities.common import find_file_capabilities
from capa.features.extractors.base_extractor import CallHandle, ThreadHandle, ProcessHandle, DynamicFeatureExtractor

Expand Down Expand Up @@ -139,38 +135,30 @@ def find_dynamic_capabilities(
feature_counts = rdoc.DynamicFeatureCounts(file=0, processes=())

assert isinstance(extractor, DynamicFeatureExtractor)
with redirecting_print_to_tqdm(disable_progress):
with tqdm.contrib.logging.logging_redirect_tqdm():
pbar = tqdm.tqdm
if disable_progress:
# do not use tqdm to avoid unnecessary side effects when caller intends
# to disable progress completely
def pbar(s, *args, **kwargs):
return s

elif not sys.stderr.isatty():
# don't display progress bar when stderr is redirected to a file
def pbar(s, *args, **kwargs):
return s

processes = list(extractor.get_processes())

pb = pbar(processes, desc="matching", unit=" processes", leave=False)
for p in pb:
process_matches, thread_matches, call_matches, feature_count = find_process_capabilities(
ruleset, extractor, p
)
feature_counts.processes += (
rdoc.ProcessFeatureCount(address=frz.Address.from_capa(p.address), count=feature_count),
)
logger.debug("analyzed %s and extracted %d features", p.address, feature_count)

for rule_name, res in process_matches.items():
all_process_matches[rule_name].extend(res)
for rule_name, res in thread_matches.items():
all_thread_matches[rule_name].extend(res)
for rule_name, res in call_matches.items():
all_call_matches[rule_name].extend(res)
processes: List[ProcessHandle] = list(extractor.get_processes())
n_processes: int = len(processes)

with capa.helpers.CapaProgressBar(
console=capa.helpers.log_console, transient=True, disable=disable_progress
) as pbar:
task = pbar.add_task("matching", total=n_processes, unit="processes")
for p in processes:
process_matches, thread_matches, call_matches, feature_count = find_process_capabilities(
ruleset, extractor, p
)
feature_counts.processes += (
rdoc.ProcessFeatureCount(address=frz.Address.from_capa(p.address), count=feature_count),
)
logger.debug("analyzed %s and extracted %d features", p.address, feature_count)

for rule_name, res in process_matches.items():
all_process_matches[rule_name].extend(res)
for rule_name, res in thread_matches.items():
all_thread_matches[rule_name].extend(res)
for rule_name, res in call_matches.items():
all_call_matches[rule_name].extend(res)

pbar.advance(task)

# collection of features that captures the rule matches within process and thread scopes.
# mapping from feature (matched rule) to set of addresses at which it matched.
Expand Down
126 changes: 53 additions & 73 deletions capa/capabilities/static.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,21 +6,18 @@
# Unless required by applicable law or agreed to in writing, software distributed under the License
# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and limitations under the License.
import sys
import time
import logging
import itertools
import collections
from typing import Any, Tuple

import tqdm.contrib.logging
from typing import Any, List, Tuple

import capa.perf
import capa.helpers
import capa.features.freeze as frz
import capa.render.result_document as rdoc
from capa.rules import Scope, RuleSet
from capa.engine import FeatureSet, MatchResults
from capa.helpers import redirecting_print_to_tqdm
from capa.capabilities.common import find_file_capabilities
from capa.features.extractors.base_extractor import BBHandle, InsnHandle, FunctionHandle, StaticFeatureExtractor

Expand Down Expand Up @@ -143,75 +140,58 @@ def find_static_capabilities(
library_functions: Tuple[rdoc.LibraryFunction, ...] = ()

assert isinstance(extractor, StaticFeatureExtractor)
with redirecting_print_to_tqdm(disable_progress):
with tqdm.contrib.logging.logging_redirect_tqdm():
pbar = tqdm.tqdm
if capa.helpers.is_runtime_ghidra():
# Ghidrathon interpreter cannot properly handle
# the TMonitor thread that is created via a monitor_interval
# > 0
Comment on lines -150 to -152
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

before merge we'll have to test and ensure the new implementation also works under Ghidra, which has some weird restrictions.

pbar.monitor_interval = 0
if disable_progress:
# do not use tqdm to avoid unnecessary side effects when caller intends
# to disable progress completely
def pbar(s, *args, **kwargs):
return s

elif not sys.stderr.isatty():
# don't display progress bar when stderr is redirected to a file
def pbar(s, *args, **kwargs):
return s

functions = list(extractor.get_functions())
n_funcs = len(functions)

pb = pbar(functions, desc="matching", unit=" functions", postfix="skipped 0 library functions", leave=False)
for f in pb:
t0 = time.time()
if extractor.is_library_function(f.address):
function_name = extractor.get_function_name(f.address)
logger.debug("skipping library function 0x%x (%s)", f.address, function_name)
library_functions += (
rdoc.LibraryFunction(address=frz.Address.from_capa(f.address), name=function_name),
)
n_libs = len(library_functions)
percentage = round(100 * (n_libs / n_funcs))
if isinstance(pb, tqdm.tqdm):
pb.set_postfix_str(f"skipped {n_libs} library functions ({percentage}%)")
continue

function_matches, bb_matches, insn_matches, feature_count = find_code_capabilities(
ruleset, extractor, f
)
feature_counts.functions += (
rdoc.FunctionFeatureCount(address=frz.Address.from_capa(f.address), count=feature_count),
)
t1 = time.time()

match_count = 0
for name, matches_ in itertools.chain(
function_matches.items(), bb_matches.items(), insn_matches.items()
):
# in practice, most matches are derived rules,
# like "check OS version/5bf4c7f39fd4492cbed0f6dc7d596d49"
# but when we log to the human, they really care about "real" rules.
if not ruleset.rules[name].is_subscope_rule():
match_count += len(matches_)

logger.debug(
"analyzed function 0x%x and extracted %d features, %d matches in %0.02fs",
f.address,
feature_count,
match_count,
t1 - t0,
functions: List[FunctionHandle] = list(extractor.get_functions())
n_funcs: int = len(functions)
n_libs: int = 0
percentage: float = 0

with capa.helpers.CapaProgressBar(
console=capa.helpers.log_console, transient=True, disable=disable_progress
) as pbar:
task = pbar.add_task(
"matching", total=n_funcs, unit="functions", postfix=f"skipped {n_libs} library functions, {percentage}%"
)
for f in functions:
t0 = time.time()
if extractor.is_library_function(f.address):
function_name = extractor.get_function_name(f.address)
logger.debug("skipping library function 0x%x (%s)", f.address, function_name)
library_functions += (
rdoc.LibraryFunction(address=frz.Address.from_capa(f.address), name=function_name),
)

for rule_name, res in function_matches.items():
all_function_matches[rule_name].extend(res)
for rule_name, res in bb_matches.items():
all_bb_matches[rule_name].extend(res)
for rule_name, res in insn_matches.items():
all_insn_matches[rule_name].extend(res)
n_libs = len(library_functions)
percentage = round(100 * (n_libs / n_funcs))
pbar.update(task, postfix=f"skipped {n_libs} library functions, {percentage}%")
pbar.advance(task)
continue

function_matches, bb_matches, insn_matches, feature_count = find_code_capabilities(ruleset, extractor, f)
feature_counts.functions += (
rdoc.FunctionFeatureCount(address=frz.Address.from_capa(f.address), count=feature_count),
)
t1 = time.time()

match_count = 0
for name, matches_ in itertools.chain(function_matches.items(), bb_matches.items(), insn_matches.items()):
if not ruleset.rules[name].is_subscope_rule():
match_count += len(matches_)

logger.debug(
"analyzed function 0x%x and extracted %d features, %d matches in %0.02fs",
f.address,
feature_count,
match_count,
t1 - t0,
)

for rule_name, res in function_matches.items():
all_function_matches[rule_name].extend(res)
for rule_name, res in bb_matches.items():
all_bb_matches[rule_name].extend(res)
for rule_name, res in insn_matches.items():
all_insn_matches[rule_name].extend(res)

pbar.advance(task)

# collection of features that captures the rule matches within function, BB, and instruction scopes.
# mapping from feature (matched rule) to set of addresses at which it matched.
Expand Down
Loading