made nessus subfolder and added script for fixing container license #320

Open · wants to merge 1 commit into base: main
10 changes: 5 additions & 5 deletions github/add_security_md_to_json_list.py
@@ -157,7 +157,7 @@ def add_commit_push_security_md(repo_path, branch_name):

# Create a pull request
def create_pull_request(repo_path, branch_name, default_branch):
"""Create a pull request for the branch, attempt to add reviewers, and assign 'wz-gsa'."""
""f"Create a pull request for the branch, attempt to add reviewers, and assign '{ASSIGNEE}'."""
Contributor:
where is ASSIGNEE being set?

Contributor:
With the f in the middle of the quotes, you no longer have a triple-quoted string; you have three strings being silently concatenated.

This should probably be either:
f"""Create a pull request for the branch, attempt to add reviewers, and assign '{ASSIGNEE}'."""

or:
f"Create a pull request for the branch, attempt to add reviewers, and assign '{ASSIGNEE}'." (since the triple-quotes aren't actually necessary here)

original_dir = os.getcwd() # Save the current directory
try:
os.chdir(repo_path) # Change to the repo's directory
@@ -195,7 +195,7 @@ def create_pull_request(repo_path, branch_name, default_branch):

if (
"Reviewers could not be requested" in result
or "Assignee could not be added" in result
or f"{ASSIGNEE} could not be added" in result
):
logging.warning(
"Attempting to add 'cloud-gov-pages-operations' as a fallback reviewer."
@@ -208,10 +208,10 @@
else:
logging.info("Reviewer successfully added.")

if "Assignee could not be added" in result:
logging.error("Failed to add 'wz-gsa' as the assignee.")
if f"{ASSIGNEE} could not be added" in result:
logging.error(f"Failed to add '{ASSIGNEE}' as the assignee.")
else:
logging.info("'wz-gsa' successfully assigned to the PR.")
logging.info(f"'{ASSIGNEE}' successfully assigned to the PR.")

except Exception as e:
logging.error(
106 changes: 106 additions & 0 deletions github/parse_everything_github_to_file.py
@@ -0,0 +1,106 @@
import os
import sys
import requests
import zipfile
import io
import argparse
import logging
from typing import List

# Setting up basic configuration for logging
logging.basicConfig(
level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
)


def get_excluded_files() -> List[str]:
"""
Returns a list of filenames to be excluded from processing.
These are typically non-code files that do not contain useful information
for analysis or model training.
"""
return ["README.md", "README", "LICENSE", "LICENSE.txt"]


def is_excluded_file(file_path: str, excluded_files: List[str]) -> bool:
"""
Determines whether a file should be excluded based on its filename ending.
Args:
file_path: The path of the file within the repository.
excluded_files: A list of filename endings to exclude.
Returns:
True if the file is to be excluded, False otherwise.
"""
return any(file_path.endswith(ex_file) for ex_file in excluded_files)


def has_sufficient_content(file_content: str, min_line_count: int = 10) -> bool:
"""
Checks if the file content has at least a minimum number of non-empty lines.
Args:
file_content: The content of the file as a string.
min_line_count: The minimum number of non-empty lines required for the file to be included.
Returns:
True if the content meets the minimum line count, False otherwise.
"""
lines = [line for line in file_content.split("\n") if line.strip()]
return len(lines) >= min_line_count


def download_and_process_files(
repo_url: str, output_file: str, branch_or_tag: str = "master"
):
"""
Downloads and processes files from a GitHub repository archive.
Args:
repo_url: The URL of the GitHub repository.
output_file: The path to the output text file where combined contents will be stored.
branch_or_tag: The branch or tag to download from the repository.
"""
excluded_files = get_excluded_files()
download_url = f"{repo_url}/archive/refs/heads/{branch_or_tag}.zip"

try:
response = requests.get(download_url)
response.raise_for_status() # Raises HTTPError for bad requests (4XX or 5XX)

with zipfile.ZipFile(io.BytesIO(response.content)) as zip_file:
with open(output_file, "w", encoding="utf-8") as outfile:
for file_path in zip_file.namelist():
if file_path.endswith("/") or is_excluded_file(
file_path, excluded_files
):
continue
with zip_file.open(file_path) as file:
file_content = file.read().decode("utf-8")
if has_sufficient_content(file_content):
outfile.write(f"# File: {file_path}\n{file_content}\n\n")

logging.info(f"Combined source code saved to {output_file}")
except requests.exceptions.HTTPError as e:
logging.error(f"HTTP Error occurred: {e}")
except requests.exceptions.RequestException as e:
logging.error(f"Error downloading the file: {e}")
except zipfile.BadZipFile:
logging.error(
"Error processing zip file: The downloaded file was not a valid zip file."
)
except Exception as e:
logging.error(f"An unexpected error occurred: {e}")


if __name__ == "__main__":
parser = argparse.ArgumentParser(
description="Download and process files from a GitHub repository."
)
parser.add_argument("repo_url", type=str, help="The URL of the GitHub repository")
parser.add_argument(
"--branch_or_tag",
type=str,
help="The branch or tag of the repository to download",
default="master",
)
args = parser.parse_args()

output_file = f"{args.repo_url.split('/')[-1]}_combined.txt"
download_and_process_files(args.repo_url, output_file, args.branch_or_tag)
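For reference, a sketch of driving the script above from Python, mirroring its __main__ block (the repository URL and branch are placeholders, not values from the PR):

repo_url = "https://github.com/cloud-gov/example-repo"  # placeholder for illustration
output_file = f"{repo_url.split('/')[-1]}_combined.txt"  # -> example-repo_combined.txt
download_and_process_files(repo_url, output_file, branch_or_tag="main")
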
236 changes: 236 additions & 0 deletions github/parse_repo_contents_into_txt.py
Contributor:
what's this for? doesn't seem to be mentioned in the PR description or READMEs.

Contributor Author:
Its addition was unintentional on my part. Updating my commit and reverting this to draft.

@@ -0,0 +1,236 @@
import os
import sys
import requests
import zipfile
import io
import ast
import argparse
import logging
from typing import List, Dict

# Configure logging
logging.basicConfig(level=logging.INFO)


def get_language_config() -> Dict[str, Dict]:
"""
Returns a dictionary of language configurations including file extensions,
excluded directories, files, and indicators for test files.
"""
return {
"python": {
"extensions": [".py", ".pyw"],
"excluded_dirs": [
"docs",
"examples",
"tests",
"test",
"scripts",
"utils",
"benchmarks",
"__pycache__",
],
"excluded_files": [
"hubconf.py",
"setup.py",
".github",
".gitignore",
"LICENSE",
"README",
"stale.py",
"gen-card-",
"write_model_card",
],
"test_indicators": [
"import unittest",
"import pytest",
"from unittest",
"from pytest",
],
},
"go": {
"extensions": [".go"],
"excluded_dirs": [
"docs",
"examples",
"tests",
"test",
"scripts",
"utils",
"benchmarks",
"vendor",
],
"excluded_files": [
"go.mod",
"go.sum",
"Makefile",
".github",
".gitignore",
"LICENSE",
"README",
],
"test_indicators": ["import testing", "func Test"],
},
"terraform": {
"extensions": [".tf", ".tfvars", ".hcl"],
"excluded_dirs": ["examples", "tests", "docs"],
"excluded_files": [".gitignore", "LICENSE", "README.md"],
"test_indicators": [],
},
"docker": {
"extensions": ["Dockerfile", ".dockerignore"],
"excluded_dirs": [],
"excluded_files": [".gitignore", "LICENSE", "README.md"],
"test_indicators": [],
},
"bosh": {
"extensions": [".yml"],
"excluded_dirs": ["docs", "examples", "tests", "test"],
"excluded_files": ["LICENSE", "README.md"],
"test_indicators": [],
},
"cloudfoundry": {
"extensions": [".yml"],
"excluded_dirs": ["docs", "examples", "tests", "test"],
"excluded_files": ["LICENSE", "README.md"],
"test_indicators": [],
},
}


def is_file_type(file_path: str, extensions: List[str]) -> bool:
"""Check if the file is of a type specified by extensions."""
return any(file_path.endswith(ext) or file_path == ext for ext in extensions)


def is_excluded_file(
file_path: str, excluded_dirs: List[str], excluded_files: List[str]
) -> bool:
"""Check if the file should be excluded based on directories or file names."""
if any(
file_path.startswith(f"{ex_dir}/") or f"/{ex_dir}/" in file_path
for ex_dir in excluded_dirs
):
return True
return file_path.split("/")[-1] in excluded_files


def has_test_indicators(content: str, indicators: List[str]) -> bool:
"""Check if file content contains test indicators specific to a language."""
return any(indicator in content for indicator in indicators)


def has_sufficient_content(file_content: str, min_line_count: int = 10) -> bool:
"""Check if the file content has a sufficient number of substantive lines."""
lines = [
line
for line in file_content.split("\n")
if line.strip() and not line.strip().startswith(("#", "//"))
]
return len(lines) >= min_line_count


def remove_comments_and_docstrings(source: str) -> str:
"""Remove comments and docstrings from Python source code."""
try:
tree = ast.parse(source)
for node in ast.walk(tree):
if isinstance(
node, (ast.FunctionDef, ast.ClassDef, ast.AsyncFunctionDef)
) and ast.get_docstring(node):
node.body = node.body[1:] # Remove docstring
elif isinstance(node, ast.Expr) and isinstance(node.value, ast.Str):
node.value.s = ""  # Blank out bare string-literal expressions (comments are already dropped by ast.parse)
return ast.unparse(tree)
except SyntaxError as e:
logging.error(f"Error parsing Python source: {e}")
return source # Return original source if it cannot be parsed


def is_likely_useful_file(file_path: str, language: str) -> bool:
"""
Determines if the file is likely to be useful by checking against configured exclusions.
"""
config = get_language_config()[language]
return not is_excluded_file(
file_path, config["excluded_dirs"], config["excluded_files"]
)


def download_and_process_files(
repo_url: str,
output_file: str,
language: str,
keep_comments: bool,
branch_or_tag: str = "master",
):
"""Download and process files from a GitHub repository based on language settings."""
try:
config = get_language_config()[language]
download_url = f"{repo_url}/archive/refs/heads/{branch_or_tag}.zip"
response = requests.get(download_url)

if response.status_code == 200:
zip_file = zipfile.ZipFile(io.BytesIO(response.content))
with open(output_file, "w", encoding="utf-8") as outfile:
for file_path in zip_file.namelist():
if (
file_path.endswith("/")
or not is_file_type(file_path, config["extensions"])
or not is_likely_useful_file(file_path, language)
):
continue
file_content = zip_file.read(file_path).decode("utf-8")

if has_test_indicators(
file_content, config["test_indicators"]
) or not has_sufficient_content(file_content):
continue
if language == "python" and not keep_comments:
file_content = remove_comments_and_docstrings(file_content)

comment_tag = "//" if language == "go" else "#"
outfile.write(
f"{comment_tag} File: {file_path}\n{file_content}\n\n"
)
logging.info(
f"Combined {language.capitalize()} source code saved to {output_file}"
)
else:
logging.error(
f"Failed to download the repository. Status code: {response.status_code}"
)
except Exception as e:
logging.error(f"An error occurred: {e}")
sys.exit(1)


if __name__ == "__main__":
parser = argparse.ArgumentParser(
description="Download and process files from a GitHub repository."
)
parser.add_argument("repo_url", type=str, help="The URL of the GitHub repository")
parser.add_argument(
"--lang",
type=str,
choices=get_language_config().keys(),
default="python",
help="The programming language of the repository",
)
parser.add_argument(
"--keep-comments",
action="store_true",
help="Keep comments and docstrings in the source code (only applicable for Python)",
)
parser.add_argument(
"--branch_or_tag",
type=str,
help="The branch or tag of the repository to download",
default="master",
)
args = parser.parse_args()

output_file = f"{args.repo_url.split('/')[-1]}_{args.lang}.txt"
download_and_process_files(
args.repo_url, output_file, args.lang, args.keep_comments, args.branch_or_tag
)
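Likewise, a sketch of calling this script's download_and_process_files directly, mirroring its __main__ block (URL, language, and branch are placeholders chosen for the example):

repo_url = "https://github.com/cloud-gov/example-go-repo"  # placeholder for illustration
output_file = f"{repo_url.split('/')[-1]}_go.txt"  # matches the _{args.lang}.txt naming above
download_and_process_files(repo_url, output_file, language="go", keep_comments=False, branch_or_tag="main")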