Skip to content

Commit

Permalink
dataset generator
Browse files Browse the repository at this point in the history
  • Loading branch information
Kye committed Sep 7, 2023
1 parent b218e76 commit 034fcd7
Showing 1 changed file with 52 additions and 0 deletions.
52 changes: 52 additions & 0 deletions pytorch/main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
import requests
import json
import base64

class GitHubDatasetGenerator:
    """Collect Python sources and READMEs from a GitHub user's repositories.

    Only the top-level directory of each repository is inspected; files in
    subdirectories are not visited.

    Args:
        username: GitHub account whose repositories are listed.
        token: Personal access token sent in the ``Authorization`` header.
            A falsy value (``None`` or ``""``) sends unauthenticated requests.
        timeout: Seconds to wait on each HTTP request before giving up.
    """

    # Largest page size the GitHub REST API accepts for list endpoints.
    _PER_PAGE = 100

    def __init__(self, username, token, timeout=30):
        self.username = username
        self.token = token
        self.timeout = timeout
        self.base_url = "https://api.github.com"
        # Sending "token None" would turn an anonymous request into a 401.
        self.headers = {'Authorization': f'token {self.token}'} if token else {}

    def get_repos(self):
        """Return every repository of ``self.username`` as a list of dicts.

        Follows the API's page-based pagination so accounts with more
        repositories than fit on one page are fully listed.

        Raises:
            requests.HTTPError: If GitHub answers with an error status.
        """
        url = f"{self.base_url}/users/{self.username}/repos"
        repos = []
        page = 1
        while True:
            response = requests.get(
                url,
                headers=self.headers,
                params={'per_page': self._PER_PAGE, 'page': page},
                timeout=self.timeout,
            )
            response.raise_for_status()
            batch = response.json()
            repos.extend(batch)
            # A short (or empty) page means there is nothing left to fetch.
            if len(batch) < self._PER_PAGE:
                return repos
            page += 1

    def get_files(self, repo_name):
        """Return the top-level content entries of ``repo_name``.

        Raises:
            requests.HTTPError: If GitHub answers with an error status.
        """
        url = f"{self.base_url}/repos/{self.username}/{repo_name}/contents"
        response = requests.get(url, headers=self.headers, timeout=self.timeout)
        response.raise_for_status()
        return response.json()

    def get_file_content(self, download_url):
        """Download ``download_url`` and return the response body as text.

        Raises:
            requests.HTTPError: If the download fails, so that an error page
                is never stored as if it were file content.
        """
        response = requests.get(download_url, timeout=self.timeout)
        response.raise_for_status()
        return response.text

    def generate_dataset(self):
        """Build the dataset from all ``.py`` files and ``README.md`` files.

        Returns:
            A list of dicts with the keys ``repo``, ``file``, ``url`` and
            ``content``, one per matching top-level file.
        """
        dataset = []
        for repo in self.get_repos():
            repo_name = repo['name']
            try:
                files = self.get_files(repo_name)
            except requests.HTTPError as err:
                # The contents endpoint reports a repository without any
                # commits as 404; skip it instead of aborting the whole run.
                if err.response is not None and err.response.status_code == 404:
                    continue
                raise
            for entry in files:
                name = entry['name']
                if not (name.endswith('.py') or name == 'README.md'):
                    continue
                download_url = entry.get('download_url')
                # Directories and submodules carry no download URL even when
                # their name happens to match the filter above.
                if not download_url:
                    continue
                dataset.append({
                    'repo': repo_name,
                    'file': name,
                    'url': entry['html_url'],
                    'content': self.get_file_content(download_url),
                })
        return dataset

    def save_dataset(self, dataset, filename):
        """Write ``dataset`` to ``filename`` as JSON Lines (one object per line)."""
        with open(filename, 'w', encoding='utf-8') as f:
            f.writelines(json.dumps(record) + '\n' for record in dataset)


# generator = GitHubDatasetGenerator('username', 'token')
# dataset = generator.generate_dataset()
# generator.save_dataset(dataset, 'dataset.jsonl')

0 comments on commit 034fcd7

Please sign in to comment.