made nessus subfolder and added script for fixing container license #320

Open · wants to merge 1 commit into base: main
10 changes: 5 additions & 5 deletions github/add_security_md_to_json_list.py
@@ -157,7 +157,7 @@ def add_commit_push_security_md(repo_path, branch_name):

# Create a pull request
def create_pull_request(repo_path, branch_name, default_branch):
"""Create a pull request for the branch, attempt to add reviewers, and assign 'wz-gsa'."""
""f"Create a pull request for the branch, attempt to add reviewers, and assign '{ASSIGNEE}'."""
Contributor:
where is ASSIGNEE being set?

Contributor:
With the f in the middle of the quotes, you no longer have a triple-quoted string; you have three strings being silently concatenated.

This should probably be either:
f"""Create a pull request for the branch, attempt to add reviewers, and assign '{ASSIGNEE}'."""

or:
f"Create a pull request for the branch, attempt to add reviewers, and assign '{ASSIGNEE}'." (since the triple-quotes aren't actually necessary here)

original_dir = os.getcwd() # Save the current directory
try:
os.chdir(repo_path) # Change to the repo's directory
@@ -195,7 +195,7 @@ def create_pull_request(repo_path, branch_name, default_branch):

if (
"Reviewers could not be requested" in result
or "Assignee could not be added" in result
or f"{ASSIGNEE} could not be added" in result
):
logging.warning(
"Attempting to add 'cloud-gov-pages-operations' as a fallback reviewer."
@@ -208,10 +208,10 @@
else:
logging.info("Reviewer successfully added.")

if "Assignee could not be added" in result:
logging.error("Failed to add 'wz-gsa' as the assignee.")
if f"{ASSIGNEE} could not be added" in result:
logging.error(f"Failed to add '{ASSIGNEE}' as the assignee.")
else:
logging.info("'wz-gsa' successfully assigned to the PR.")
logging.info(f"'{ASSIGNEE}' successfully assigned to the PR.")

except Exception as e:
logging.error(
106 changes: 106 additions & 0 deletions github/parse_everything_github_to_file.py
@@ -0,0 +1,106 @@
import os
import sys
import requests
import zipfile
import io
import argparse
import logging
from typing import List

# Setting up basic configuration for logging
logging.basicConfig(
level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
)


def get_excluded_files() -> List[str]:
"""
Returns a list of filenames to be excluded from processing.
These are typically non-code files that do not contain useful information
for analysis or model training.
"""
return ["README.md", "README", "LICENSE", "LICENSE.txt"]


def is_excluded_file(file_path: str, excluded_files: List[str]) -> bool:
"""
Determines whether a file should be excluded based on its filename ending.
Args:
file_path: The path of the file within the repository.
excluded_files: A list of filename endings to exclude.
Returns:
True if the file is to be excluded, False otherwise.
"""
return any(file_path.endswith(ex_file) for ex_file in excluded_files)


def has_sufficient_content(file_content: str, min_line_count: int = 10) -> bool:
"""
Checks if the file content has at least a minimum number of non-empty lines.
Args:
file_content: The content of the file as a string.
min_line_count: The minimum number of non-empty lines required for the file to be included.
Returns:
True if the content meets the minimum line count, False otherwise.
"""
lines = [line for line in file_content.split("\n") if line.strip()]
return len(lines) >= min_line_count


def download_and_process_files(
repo_url: str, output_file: str, branch_or_tag: str = "master"
):
"""
Downloads and processes files from a GitHub repository archive.
Args:
repo_url: The URL of the GitHub repository.
output_file: The path to the output text file where combined contents will be stored.
branch_or_tag: The branch or tag to download from the repository.
"""
excluded_files = get_excluded_files()
download_url = f"{repo_url}/archive/refs/heads/{branch_or_tag}.zip"

try:
response = requests.get(download_url)
response.raise_for_status() # Raises HTTPError for bad requests (4XX or 5XX)

with zipfile.ZipFile(io.BytesIO(response.content)) as zip_file:
with open(output_file, "w", encoding="utf-8") as outfile:
for file_path in zip_file.namelist():
if file_path.endswith("/") or is_excluded_file(
file_path, excluded_files
):
continue
with zip_file.open(file_path) as file:
file_content = file.read().decode("utf-8")
if has_sufficient_content(file_content):
outfile.write(f"# File: {file_path}\n{file_content}\n\n")

logging.info(f"Combined source code saved to {output_file}")
except requests.exceptions.HTTPError as e:
logging.error(f"HTTP Error occurred: {e}")
except requests.exceptions.RequestException as e:
logging.error(f"Error downloading the file: {e}")
except zipfile.BadZipFile:
logging.error(
"Error processing zip file: The downloaded file was not a valid zip file."
)
except Exception as e:
logging.error(f"An unexpected error occurred: {e}")


if __name__ == "__main__":
parser = argparse.ArgumentParser(
description="Download and process files from a GitHub repository."
)
parser.add_argument("repo_url", type=str, help="The URL of the GitHub repository")
parser.add_argument(
"--branch_or_tag",
type=str,
help="The branch or tag of the repository to download",
default="master",
)
args = parser.parse_args()

output_file = f"{args.repo_url.split('/')[-1]}_combined.txt"
download_and_process_files(args.repo_url, output_file, args.branch_or_tag)
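For reference, a sketch of driving the script above from Python, mirroring its __main__ block (the repository URL and branch are placeholders, not values from the PR):

repo_url = "https://github.com/cloud-gov/example-repo"  # placeholder for illustration
output_file = f"{repo_url.split('/')[-1]}_combined.txt"  # -> example-repo_combined.txt
download_and_process_files(repo_url, output_file, branch_or_tag="main")
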
236 changes: 236 additions & 0 deletions github/parse_repo_contents_into_txt.py
Contributor:
what's this for? doesn't seem to be mentioned in the PR description or READMEs.

Contributor Author:
Its addition was unintentional on my part. Updating my commit and reverting this to draft.

@@ -0,0 +1,236 @@
import os
import sys
import requests
import zipfile
import io
import ast
import argparse
import logging
from typing import List, Dict

# Configure logging
logging.basicConfig(level=logging.INFO)


def get_language_config() -> Dict[str, Dict]:
"""
Returns a dictionary of language configurations including file extensions,
excluded directories, files, and indicators for test files.
"""
return {
"python": {
"extensions": [".py", ".pyw"],
"excluded_dirs": [
"docs",
"examples",
"tests",
"test",
"scripts",
"utils",
"benchmarks",
"__pycache__",
],
"excluded_files": [
"hubconf.py",
"setup.py",
".github",
".gitignore",
"LICENSE",
"README",
"stale.py",
"gen-card-",
"write_model_card",
],
"test_indicators": [
"import unittest",
"import pytest",
"from unittest",
"from pytest",
],
},
"go": {
"extensions": [".go"],
"excluded_dirs": [
"docs",
"examples",
"tests",
"test",
"scripts",
"utils",
"benchmarks",
"vendor",
],
"excluded_files": [
"go.mod",
"go.sum",
"Makefile",
".github",
".gitignore",
"LICENSE",
"README",
],
"test_indicators": ["import testing", "func Test"],
},
"terraform": {
"extensions": [".tf", ".tfvars", ".hcl"],
"excluded_dirs": ["examples", "tests", "docs"],
"excluded_files": [".gitignore", "LICENSE", "README.md"],
"test_indicators": [],
},
"docker": {
"extensions": ["Dockerfile", ".dockerignore"],
"excluded_dirs": [],
"excluded_files": [".gitignore", "LICENSE", "README.md"],
"test_indicators": [],
},
"bosh": {
"extensions": [".yml"],
"excluded_dirs": ["docs", "examples", "tests", "test"],
"excluded_files": ["LICENSE", "README.md"],
"test_indicators": [],
},
"cloudfoundry": {
"extensions": [".yml"],
"excluded_dirs": ["docs", "examples", "tests", "test"],
"excluded_files": ["LICENSE", "README.md"],
"test_indicators": [],
},
}


def is_file_type(file_path: str, extensions: List[str]) -> bool:
"""Check if the file is of a type specified by extensions."""
return any(file_path.endswith(ext) or file_path == ext for ext in extensions)


def is_excluded_file(
file_path: str, excluded_dirs: List[str], excluded_files: List[str]
) -> bool:
"""Check if the file should be excluded based on directories or file names."""
if any(
file_path.startswith(f"{ex_dir}/") or f"/{ex_dir}/" in file_path
for ex_dir in excluded_dirs
):
return True
return file_path.split("/")[-1] in excluded_files


def has_test_indicators(content: str, indicators: List[str]) -> bool:
"""Check if file content contains test indicators specific to a language."""
return any(indicator in content for indicator in indicators)


def has_sufficient_content(file_content: str, min_line_count: int = 10) -> bool:
"""Check if the file content has a sufficient number of substantive lines."""
lines = [
line
for line in file_content.split("\n")
if line.strip() and not line.strip().startswith(("#", "//"))
]
return len(lines) >= min_line_count


def remove_comments_and_docstrings(source: str) -> str:
"""Remove comments and docstrings from Python source code."""
try:
tree = ast.parse(source)
for node in ast.walk(tree):
if isinstance(
node, (ast.FunctionDef, ast.ClassDef, ast.AsyncFunctionDef)
) and ast.get_docstring(node):
node.body = node.body[1:] # Remove docstring
elif isinstance(node, ast.Expr) and isinstance(node.value, ast.Str):
node.value.s = ""  # Blank out bare string-literal expressions (comments are already dropped by ast.parse)
return ast.unparse(tree)
except SyntaxError as e:
logging.error(f"Error parsing Python source: {e}")
return source # Return original source if it cannot be parsed


def is_likely_useful_file(file_path: str, language: str) -> bool:
"""
Determines if the file is likely to be useful by checking against configured exclusions.
"""
config = get_language_config()[language]
return not is_excluded_file(
file_path, config["excluded_dirs"], config["excluded_files"]
)


def download_and_process_files(
repo_url: str,
output_file: str,
language: str,
keep_comments: bool,
branch_or_tag: str = "master",
):
"""Download and process files from a GitHub repository based on language settings."""
try:
config = get_language_config()[language]
download_url = f"{repo_url}/archive/refs/heads/{branch_or_tag}.zip"
response = requests.get(download_url)

if response.status_code == 200:
zip_file = zipfile.ZipFile(io.BytesIO(response.content))
with open(output_file, "w", encoding="utf-8") as outfile:
for file_path in zip_file.namelist():
if (
file_path.endswith("/")
or not is_file_type(file_path, config["extensions"])
or not is_likely_useful_file(file_path, language)
):
continue
file_content = zip_file.read(file_path).decode("utf-8")

if has_test_indicators(
file_content, config["test_indicators"]
) or not has_sufficient_content(file_content):
continue
if language == "python" and not keep_comments:
file_content = remove_comments_and_docstrings(file_content)

comment_tag = "//" if language == "go" else "#"
outfile.write(
f"{comment_tag} File: {file_path}\n{file_content}\n\n"
)
logging.info(
f"Combined {language.capitalize()} source code saved to {output_file}"
)
else:
logging.error(
f"Failed to download the repository. Status code: {response.status_code}"
)
except Exception as e:
logging.error(f"An error occurred: {e}")
sys.exit(1)


if __name__ == "__main__":
parser = argparse.ArgumentParser(
description="Download and process files from a GitHub repository."
)
parser.add_argument("repo_url", type=str, help="The URL of the GitHub repository")
parser.add_argument(
"--lang",
type=str,
choices=get_language_config().keys(),
default="python",
help="The programming language of the repository",
)
parser.add_argument(
"--keep-comments",
action="store_true",
help="Keep comments and docstrings in the source code (only applicable for Python)",
)
parser.add_argument(
"--branch_or_tag",
type=str,
help="The branch or tag of the repository to download",
default="master",
)
args = parser.parse_args()

output_file = f"{args.repo_url.split('/')[-1]}_{args.lang}.txt"
download_and_process_files(
args.repo_url, output_file, args.lang, args.keep_comments, args.branch_or_tag
)
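Likewise, a sketch of calling this script's download_and_process_files directly, mirroring its __main__ block (URL, language, and branch are placeholders chosen for the example):

repo_url = "https://github.com/cloud-gov/example-go-repo"  # placeholder for illustration
output_file = f"{repo_url.split('/')[-1]}_go.txt"  # matches the _{args.lang}.txt naming above
download_and_process_files(repo_url, output_file, language="go", keep_comments=False, branch_or_tag="main")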