Skip to content

Commit

Permalink
updated utility scripts docs
Browse files Browse the repository at this point in the history
  • Loading branch information
karacolada committed Jul 3, 2024
1 parent 2306819 commit 1b0e0ec
Show file tree
Hide file tree
Showing 3 changed files with 117 additions and 4 deletions.
3 changes: 3 additions & 0 deletions src/utils/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# `src/utils`

This folder contains utility scripts that are not required for mining or analysis.
17 changes: 13 additions & 4 deletions src/utils/create_representative_set_github.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
"""
Samples 100 repositories from GitHub, 20 from each of five star-count intervals.
Some metadata about these repositories is extracted and stored in a dataframe which is written to ../data/representative_set.csv.
Some metadata about these repositories is extracted and stored in a dataframe.
"""

from github import Github, GithubException
import json
import argparse
import pandas as pd
import configparser
from tqdm import tqdm
Expand All @@ -16,7 +16,7 @@ def get_access_token():
str: Access Token
"""
config = configparser.ConfigParser()
config.read('../config.cfg')
config.read('../../config.cfg')
return config['ACCESS']['token']

def parse_samples(slice):
Expand Down Expand Up @@ -49,6 +49,15 @@ def compose_repo_link(row) -> str:
return link

if __name__ == "__main__":
parser = argparse.ArgumentParser(
prog="create_representative_set_github",
description="Sample 100 repositories from Github based on different amounts of stars."
)
parser.add_argument("-o", "--output", type=str,
help="output path for representative set",
default="../../data/debug/representative_set.csv")
args = parser.parse_args()

g = Github(get_access_token())
samples = {}
stars_intervals = ["<1", "1..100", "100..1000", "1000..10000", ">10000"]
Expand All @@ -57,4 +66,4 @@ def compose_repo_link(row) -> str:
samples[interval] = parse_samples(result[:20])
df = pd.concat(samples.values())
df["github_id"] = df.apply(compose_repo_link, axis=1)
df.to_csv("../data/representative_set.csv", index=False)
df.to_csv(args.output, index=False)
101 changes: 101 additions & 0 deletions src/utils/rep_set_test.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
user_name,repo_name,stars,watchers,forks,commits_no,contributors_no,size_kb,github_id
jairourrego123,AppCalorias,0,1,0,20,1,621,jairourrego123/AppCalorias
hetpatel-web,SystemDevMetho,0,1,0,0,0,0,hetpatel-web/SystemDevMetho
morlzz111,mz.github.io,0,1,0,2,1,290,morlzz111/mz.github.io
manjeshboyapati,manjesh2,0,1,0,1,1,30,manjeshboyapati/manjesh2
ds4owd-001,md-02-assignments-AlejandraSuarezCobos,0,1,0,3,1,1412,ds4owd-001/md-02-assignments-AlejandraSuarezCobos
Arancium98,ucihydraulic,0,1,0,13,1,175197,Arancium98/ucihydraulic
Peramitul,Peramitul,0,1,0,0,0,0,Peramitul/Peramitul
acciojob,auto-complete-letsmailvjkumar,0,2,0,2,2,394,acciojob/auto-complete-letsmailvjkumar
JuanCohetes,traductor,0,1,0,2,1,58,JuanCohetes/traductor
DMG-01,ETH_TRANSFER_APP,0,1,0,8,1,173,DMG-01/ETH_TRANSFER_APP
distinctm1nd,rosy_cross_badge,0,1,0,0,0,0,distinctm1nd/rosy_cross_badge
aurelienheude,P5-Print-it-JS,0,1,0,5,1,3651,aurelienheude/P5-Print-it-JS
Andynico,Arabica_custom,0,1,0,35741,1,1066436,Andynico/Arabica_custom
MarcinhoLetsCode,Jan-Ken-Po_MultiPlayer_V1,0,1,0,1,1,0,MarcinhoLetsCode/Jan-Ken-Po_MultiPlayer_V1
Doston3300,barlow,0,1,0,1,1,2890,Doston3300/barlow
ava-smith,ava-smith,0,1,0,9,1,9,ava-smith/ava-smith
jesusdpdev,my-app-contador,0,1,0,2,1,194,jesusdpdev/my-app-contador
andrii-cherevko,andrii-cherevko.github.io,0,1,0,11,1,2962,andrii-cherevko/andrii-cherevko.github.io
Greavis,telebot_moderator,0,1,0,7,1,12,Greavis/telebot_moderator
imSHUBHANKAR,php-ProjectCA3,0,1,0,0,0,0,imSHUBHANKAR/php-ProjectCA3
RedisGraph,redisgraph.js,100,8,26,207,10,11654,RedisGraph/redisgraph.js
leoafarias,neardb,100,6,9,219,5,2272,leoafarias/neardb
vueComponent,vue-ref,100,5,14,13,2,19,vueComponent/vue-ref
now1then,vue-h5-pro,100,6,50,12,1,2476,now1then/vue-h5-pro
ivaylokenov,CSharp-ORM-Battle,99,4,1,7,1,2213,ivaylokenov/CSharp-ORM-Battle
HazyResearch,fonduer-tutorials,100,18,26,100,5,8514,HazyResearch/fonduer-tutorials
recoilme,slowpoke,100,8,9,141,4,2334,recoilme/slowpoke
Peihao2021,O-NKU,100,13,8,3,1,68320,Peihao2021/O-NKU
meizhiju,layered-bilstm-crf,100,5,26,3,1,56,meizhiju/layered-bilstm-crf
corellium,preloader-m1,100,13,9,14,2,60,corellium/preloader-m1
Kimundi,greenwasm,100,7,2,243,1,568,Kimundi/greenwasm
shouldnotappearcalm,yapi-plugin-interface-oauth2-token,100,5,22,55,4,83867,shouldnotappearcalm/yapi-plugin-interface-oauth2-token
elizarov,CoroutinesWorkshop,100,13,15,35,1,51979,elizarov/CoroutinesWorkshop
pathwar,pathwar,100,8,26,2864,28,37461,pathwar/pathwar
devpew,muffinReactNative,100,4,3,17,1,201,devpew/muffinReactNative
Nielk1,VSCView,100,10,5,299,4,18890,Nielk1/VSCView
0xC45,homelab-setup,100,3,8,65,1,106,0xC45/homelab-setup
erocoar,ggpol,100,4,10,168,3,1423,erocoar/ggpol
fancompute,qpga,100,10,16,42,2,15637,fancompute/qpga
imohamad,twitter-downloader-telegram-bot,100,5,31,5,0,5,imohamad/twitter-downloader-telegram-bot
ldqk,Masuit.MyBlogs,1000,36,285,1182,3,508818,ldqk/Masuit.MyBlogs
vuejs,test-utils,1000,26,236,2060,144,6050,vuejs/test-utils
dagger8224,dagger.js,1000,16,44,181,3,757,dagger8224/dagger.js
zhihu,rucene,1000,31,60,273,4,1915,zhihu/rucene
ratwithacompiler,OBS-captions-plugin,1000,26,69,189,2,2582,ratwithacompiler/OBS-captions-plugin
201853910,VMwareWorkstation,999,21,196,38,1,56,201853910/VMwareWorkstation
nccgroup,singularity,1000,32,142,210,5,3313,nccgroup/singularity
Kethsar,ytarchive,1000,26,87,302,16,213,Kethsar/ytarchive
Spu7Nix,SPWN-language,1000,18,61,1130,28,4166,Spu7Nix/SPWN-language
taigaio,taiga-docker,1000,16,270,127,13,563,taigaio/taiga-docker
labulakalia,crocodile,1000,20,163,161,6,14288,labulakalia/crocodile
colbyfayock,50-projects-for-react-and-the-static-web,1000,21,141,201,12,168,colbyfayock/50-projects-for-react-and-the-static-web
edvardHua,PoseEstimationForMobile,1000,53,261,86,5,167185,edvardHua/PoseEstimationForMobile
iximiuz,client-go-examples,1000,22,126,57,5,160,iximiuz/client-go-examples
open-source-labs,SvelteStorm,1000,17,114,690,27,35955,open-source-labs/SvelteStorm
kakaobrain,kogpt,1000,17,131,43,7,81,kakaobrain/kogpt
Rikj000,MoniGoMani,1000,64,162,1193,24,271688,Rikj000/MoniGoMani
Tsojan,TsojanScan,1000,14,56,13,1,51,Tsojan/TsojanScan
The-XSS-Rat,SecurityTesting,1000,47,258,165,1,15022,The-XSS-Rat/SecurityTesting
AGI-Edgerunners,LLM-Adapters,1000,12,90,180,5,76750,AGI-Edgerunners/LLM-Adapters
openspug,spug,9995,194,2025,1223,9,5538,openspug/spug
Baiyuetribe,paper2gui,9988,116,832,84,3,138737,Baiyuetribe/paper2gui
nvim-treesitter,nvim-treesitter,9985,48,845,5289,370,7114,nvim-treesitter/nvim-treesitter
SimplifyJobs,New-Grad-Positions,9968,1305,953,2078,263,2259,SimplifyJobs/New-Grad-Positions
pingcap,talent-plan,9957,251,1279,568,96,4110,pingcap/talent-plan
twitter,the-algorithm-ml,9954,101,2242,2,0,109,twitter/the-algorithm-ml
neovim,nvim-lspconfig,9946,84,2017,2970,445,4070,neovim/nvim-lspconfig
alexandresanlim,Badges4-README.md-Profile,9945,47,1532,1602,226,1465,alexandresanlim/Badges4-README.md-Profile
m-bain,whisperX,9940,123,1002,368,66,24060,m-bain/whisperX
Dujltqzv,Some-Many-Books,9939,122,1371,4,1,10,Dujltqzv/Some-Many-Books
aristocratos,bpytop,9926,155,406,408,32,1343,aristocratos/bpytop
sfyc23,EverydayWechat,9922,206,2265,214,13,359,sfyc23/EverydayWechat
lyhue1991,eat_tensorflow2_in_30_days,9912,269,2427,272,3,61324,lyhue1991/eat_tensorflow2_in_30_days
xenova,transformers.js,9910,71,592,1081,28,104477,xenova/transformers.js
chaitin,xray,9905,206,1781,821,108,35749,chaitin/xray
veeral-patel,how-to-secure-anything,9899,226,671,353,5,45025,veeral-patel/how-to-secure-anything
microsoft,wslg,9897,117,304,241,37,2031,microsoft/wslg
microsoft,STL,9894,249,1468,2079,200,29399,microsoft/STL
soxoj,maigret,9892,93,780,913,32,5860,soxoj/maigret
kubescape,kubescape,9891,96,816,3022,126,111568,kubescape/kubescape
codecrafters-io,build-your-own-x,40000,5227,26391,563,117,1065,codecrafters-io/build-your-own-x
996icu,996.ICU,40000,4224,21522,3205,398,187804,996icu/996.ICU
trekhleb,javascript-algorithms,40000,4359,29390,1104,195,13248,trekhleb/javascript-algorithms
CyC2018,CS-Notes,40000,5321,49300,3781,215,116179,CyC2018/CS-Notes
Significant-Gravitas,AutoGPT,40000,1562,43391,5357,440,129809,Significant-Gravitas/AutoGPT
jackfrued,Python-100-Days,40000,6137,51039,380,12,332089,jackfrued/Python-100-Days
Snailclimb,JavaGuide,40000,4523,44927,5419,416,175380,Snailclimb/JavaGuide
trimstray,the-book-of-secret-knowledge,40000,2410,8922,1068,98,1811,trimstray/the-book-of-secret-knowledge
AUTOMATIC1111,stable-diffusion-webui,40000,1049,25333,7384,430,35862,AUTOMATIC1111/stable-diffusion-webui
huggingface,transformers,40000,1099,24800,16262,433,234333,huggingface/transformers
labuladong,fucking-algorithm,40000,2311,22473,496,84,125717,labuladong/fucking-algorithm
microsoft,PowerToys,40000,1142,6613,7359,418,365855,microsoft/PowerToys
f,awesome-chatgpt-prompts,40000,1385,14631,434,80,759,f/awesome-chatgpt-prompts
GrowingGit,GitHub-Chinese-Top-Charts,40000,2576,12421,920,1,99430,GrowingGit/GitHub-Chinese-Top-Charts
denoland,deno,40000,1414,5340,11670,434,129247,denoland/deno
langchain-ai,langchain,40000,670,13956,10190,477,246053,langchain-ai/langchain
massgravel,Microsoft-Activation-Scripts,40000,895,8443,142,5,7997,massgravel/Microsoft-Activation-Scripts
microsoft,Web-Dev-For-Beginners,40000,2701,12222,1689,211,86694,microsoft/Web-Dev-For-Beginners
iptv-org,iptv,40000,1869,2198,30079,278,628268,iptv-org/iptv
tauri-apps,tauri,40000,498,2507,4677,356,83797,tauri-apps/tauri

0 comments on commit 1b0e0ec

Please sign in to comment.