Skip to content

Commit

Permalink
updating plot scripts for poster
Browse files Browse the repository at this point in the history
  • Loading branch information
karacolada committed Jul 23, 2024
1 parent 7b1a3f1 commit 759a8b1
Show file tree
Hide file tree
Showing 5 changed files with 453 additions and 63 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -141,6 +141,7 @@ dmypy.json
!src/analysis/eprints.ipynb

*.png
#!data/derived/plots/overall/*.png

data/debug/*
!data/debug/representative_set.csv
Expand Down
121 changes: 97 additions & 24 deletions src/analysis/eprints.ipynb

Large diffs are not rendered by default.

26 changes: 23 additions & 3 deletions src/analysis/mention_type_timeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,24 @@
import os
import pandas as pd
import seaborn as sns
import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt
import numpy as np
from datetime import timedelta

# Global matplotlib font sizes, applied once at import time.
SMALL_SIZE = 24   # regular text: axes titles/labels, tick labels, legend
MEDIUM_SIZE = 30  # figure-level title only

# Equivalent to a series of plt.rc(group, key=value) calls, written as one
# update of the underlying rcParams mapping ("group.key" -> value).
plt.rcParams.update({
    'font.size': SMALL_SIZE,         # default text size
    # axes titles intentionally use SMALL_SIZE rather than MEDIUM_SIZE
    'axes.titlesize': SMALL_SIZE,
    'axes.labelsize': SMALL_SIZE,    # x and y axis labels
    'xtick.labelsize': SMALL_SIZE,   # x tick labels
    'ytick.labelsize': SMALL_SIZE,   # y tick labels
    'legend.fontsize': SMALL_SIZE,   # legend entries
    'figure.titlesize': MEDIUM_SIZE, # figure title (suptitle)
})

def main(githubdir, outdir):
# load data mapping ePrints publication data to intent (produced by repo_intent.py)
eprints_df = pd.read_csv(os.path.join(outdir, "eprints_w_intent.csv"), index_col=0)
Expand All @@ -17,7 +31,7 @@ def main(githubdir, outdir):
df["created_at"] = pd.to_datetime(df.created_at)
df["mention type"] = np.where(df["mention_created"], "created", "not created")
# plot repo creation date against date listed in ePrints entry (assumed to be publication date)
ax = plt.axes()
fig, ax = plt.subplots(figsize=(10,8))
ax.grid(True)

xlim = [df["created_at"].min(), df["created_at"].max()]
Expand All @@ -34,12 +48,18 @@ def main(githubdir, outdir):
x = "created_at",
y = "eprints_date",
hue="mention type",
s=80
)

h,l = ax.get_legend_handles_labels()
ax.legend_.remove()
ax.legend(h, l, ncol=2, loc="upper center", bbox_to_anchor=(0.5, 1.13), borderpad=0.2)

ax.set(xlabel="GitHub repository creation date",
ylabel="publication date") # it's usually the publication date, though not always
ax.set_title("Mention type depending on distance between repo creation and publication date")
ax.set_title("Mention type depending on difference\nbetween repo creation and publication date", pad=45)
plt.tight_layout()
plt.savefig(os.path.join(outdir, "plots/overall/mention_type_timeline.png"))
plt.savefig(os.path.join(outdir, "plots/overall/mention_type_timeline.png"), bbox_inches="tight", transparent=True)

if __name__=="__main__":
parser = argparse.ArgumentParser(
Expand Down
99 changes: 63 additions & 36 deletions src/analysis/overall.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,23 @@
import string
import re
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import matplotlib
matplotlib.use("Agg")
from matplotlib import pyplot as plt
from datetime import datetime

# Global matplotlib font sizes, applied once at import time.
SMALL_SIZE = 24   # regular text: axes titles/labels, tick labels, legend, table cells
MEDIUM_SIZE = 30  # figure-level title only

# Equivalent to a series of plt.rc(group, key=value) calls, written as one
# update of the underlying rcParams mapping ("group.key" -> value).
plt.rcParams.update({
    'font.size': SMALL_SIZE,         # default text size
    # axes titles intentionally use SMALL_SIZE rather than MEDIUM_SIZE
    'axes.titlesize': SMALL_SIZE,
    'axes.labelsize': SMALL_SIZE,    # x and y axis labels
    'xtick.labelsize': SMALL_SIZE,   # x tick labels
    'ytick.labelsize': SMALL_SIZE,   # y tick labels
    'legend.fontsize': SMALL_SIZE,   # legend entries
    'figure.titlesize': MEDIUM_SIZE, # figure title (suptitle)
})

def info(verbose, msg):
    """Print an informational message when verbose output is enabled.

    Args:
        verbose (bool): nothing is printed when this is falsy
        msg (str): text to print; it is prefixed with "[INFO] "
    """
    if not verbose:
        return
    print(f"[INFO] {msg}")
Expand Down Expand Up @@ -51,13 +65,14 @@ def plot_license_type(contents, ax):
contents.license == "other", "unknown", "non-permissive")))
# plot value counts
contents.license_type.value_counts().sort_index().plot(
kind='bar',
kind='barh',
ax=ax,
xlabel="license type",
ylabel="repository count"
#ylabel="license type",
#xlabel="repository count"
)
ax.bar_label(ax.containers[0])
ax.set_xticklabels(ax.get_xticklabels(), rotation=45)
ax.set_title("license type")
#ax.set_xticklabels(ax.get_xticklabels(), rotation=45)

def plot_contributing_file_present(contents, ax):
"""Plot a bar chart visualising the number of repositories with contribution guidelines.
Expand All @@ -67,13 +82,14 @@ def plot_contributing_file_present(contents, ax):
ax (Axes): subplot to use
"""
pd.notna(contents.contributing_added).value_counts().plot(
kind='bar',
kind='barh',
ax=ax,
xlabel="contributing file",
ylabel="repository count"
#ylabel="contributing file",
#xlabel="repository count"
)
ax.bar_label(ax.containers[0])
ax.set_xticklabels(ax.get_xticklabels(), rotation=45)
ax.set_title("contributing file present")
#ax.set_xticklabels(ax.get_xticklabels(), rotation=45)

def plot_emojis(contents, ax):
"""Plot a histogram visualising the number of emojis found in repository READMEs.
Expand All @@ -88,9 +104,10 @@ def plot_emojis(contents, ax):
counts, bins = np.histogram(contents.readme_emojis, bins)
binlabels = [f"[{bins[i]} - {bins[i+1]})" for i in range(len(bins)-2)]
binlabels += [f"[{bins[-2]} - {bins[-1]}]"]
ax.bar(binlabels, counts)
ax.barh(binlabels, counts)
ax.bar_label(ax.containers[0])
ax.set(xlabel="number of emojis in README", ylabel="repository count")
#ax.set(ylabel="number of emojis in README", xlabel="repository count")
ax.set_title("number of emojis in README")

def plot_team_size(metadata, contributions, ax):
"""Plot a histogram visualising the maximum team size for a repository.
Expand All @@ -117,9 +134,11 @@ def plot_team_size(metadata, contributions, ax):
counts, bins = np.histogram(max_team_size, bins)
binlabels = [f"[{bins[i]} - {bins[i+1]})" for i in range(len(bins)-2)]
binlabels += [f"[{bins[-2]} - {bins[-1]}]"]
ax.bar(binlabels, counts)
ax.barh(binlabels, counts)
ax.bar_label(ax.containers[0])
ax.set(xlabel="maximum team size", ylabel="repository count")
#ax.set(ylabel="maximum team size", xlabel="repository count")
ax.set(xlabel="repository count")
ax.set_title("maximum team size")

def plot_readme_size(contents, ax, type="bar"):
"""Plot a histogram of the size of the README file found in repositories. The bin limits were chosen empirically.
Expand All @@ -131,18 +150,20 @@ def plot_readme_size(contents, ax, type="bar"):
"""
bins = [0, 1, 300, 1500, 10000]
binmeanings = ["none", "ultra-short", "short", "informative", "detailed"]
colours = ["#3875b1", "#f1882e", "#4d9d39", "#c73f30", "#8e6aba"]
if contents.readme_size.max() > bins[-1]:
bins.append(contents.readme_size.max())
counts, bins = np.histogram(contents.readme_size, bins)
binlabels = [f"{binmeanings[i]}\n[{bins[i]} - {bins[i+1]})" for i in range(len(bins)-2)]
binlabels += [f"{binmeanings[-1]}\n[{bins[-2]} - {bins[-1]}]"]
if type=="bar":
ax.bar(binlabels, counts)
ax.barh(binlabels, counts)
ax.bar_label(ax.containers[0])
ax.tick_params(axis='x', labelrotation=45)
ax.set(xlabel="size of README in Bytes", ylabel="repository count")
ax.set(ylabel="size of README in Bytes", xlabel="repository count")
elif type=="pie":
ax.pie(counts, labels=binlabels, autopct='%1.1f%%')
# remove empty bins
ax.pie(counts[np.nonzero(counts)], labels=np.array(binlabels)[np.nonzero(counts)], autopct='%1.1f%%', colors=np.array(colours)[np.nonzero(counts)])
ax.set(xlabel="size of README in Bytes")

def plot_headings(readme_df, ax):
Expand Down Expand Up @@ -170,8 +191,8 @@ def plot_headings(readme_df, ax):
background_color="white",
random_state=42
).generate(" ".join(headings))
ax.imshow(wordcloud)
ax.set_axis_off()
ax.imshow(wordcloud)
ax.set(title="README headings")

def plot_table(metadata, stars, forks, ax):
Expand All @@ -189,18 +210,24 @@ def plot_table(metadata, stars, forks, ax):
star_counts = stars.groupby("github_user_cleaned_url")["user"].count()
star_counts.rename("stars_no", inplace=True)
cell_text = [
[f"{age.mean():.2f}", f"{age.std():.2f}", f"{age.median():.1f}", f"{age.min():.0f}", f"{age.max():.0f}"],
[f"{fork_counts.mean():.2f}", f"{fork_counts.std():.2f}", f"{fork_counts.median():.1f}", f"{fork_counts.min():.0f}", f"{fork_counts.max():.0f}"],
[f"{star_counts.mean():.2f}", f"{star_counts.std():.2f}", f"{star_counts.median():.1f}", f"{star_counts.min():.0f}", f"{star_counts.max():.0f}"]
]
[f"{age.mean():.2f}", f"{fork_counts.mean():.2f}", f"{star_counts.mean():.2f}"],
[f"{age.std():.2f}", f"{fork_counts.std():.2f}", f"{star_counts.std():.2f}"],
[f"{age.median():.0f}", f"{fork_counts.median():.0f}", f"{star_counts.median():.0f}"],
[f"{age.min():.0f}", f"{fork_counts.min():.0f}", f"{star_counts.min():.0f}"],
[f"{age.max():.0f}", f"{fork_counts.max():.0f}", f"{star_counts.max():.0f}"],
]
table = ax.table(cellText=cell_text,
rowLabels=["age (weeks)", "forks", "stars"],
colLabels=["mean", "std", "median", "min", "max"],
loc='center right'
colLabels=["age (weeks)", "forks", "stars"],
rowLabels=["mean", "std", "median", "min", "max"],
colWidths=[0.4, 0.3, 0.3],
loc='center right',
#bbox=(0, 0, 1, 1)
)
table.scale(0.85, 1)
table.auto_set_font_size(False)
table.set_fontsize(SMALL_SIZE)
table.scale(1, 3)
ax.set_axis_off()
ax.set(title="stats")
ax.set_title("Summary statistics", pad=25)

def main(data_dir, outdir, verbose, filter_path, tag):
info(verbose, "Loading data...")
Expand All @@ -225,15 +252,15 @@ def main(data_dir, outdir, verbose, filter_path, tag):
forks = forks.loc[forks.github_user_cleaned_url.isin(filtered)]

info(verbose, "Plotting...")
fig = plt.figure(figsize=(18, 12))
ax1 = plt.subplot(2, 3, 1)
ax2 = plt.subplot(2, 3, 2)
ax4 = plt.subplot(2, 3, 4)
ax5 = plt.subplot(2, 3, 5)
ax3 = plt.subplot(6, 3, (3, 9))
ax6 = plt.subplot(6, 3, (12, 15))
ax7 = plt.subplot(6, 3, 18)
fig.tight_layout(h_pad=1, w_pad=5, rect=(0.05, 0.05, 0.95, 0.95))
fig = plt.figure(figsize=(20, 18))
ax5 = plt.subplot(16, 5, (61, 78))
ax4 = plt.subplot(16, 5, (46, 58), sharex=ax5)
ax2 = plt.subplot(16, 5, (21, 43), sharex=ax5)
ax1 = plt.subplot(16, 5, (1, 18), sharex=ax5)
ax3 = plt.subplot(8, 5, (4, 15))
ax6 = plt.subplot(8, 5, (19, 30))
ax7 = plt.subplot(8, 5, (34, 40))
fig.tight_layout(h_pad=0.5, w_pad=3, rect=(0.05, 0.05, 0.95, 0.95))
plot_license_type(contents, ax1)
plot_emojis(contents, ax2)
plot_contributing_file_present(contents, ax4)
Expand All @@ -243,10 +270,10 @@ def main(data_dir, outdir, verbose, filter_path, tag):
plot_table(metadata, stars, forks, ax7)
if tag:
plt.suptitle(f"Overall statistics for ePrints repositories ({tag})")
plt.savefig(os.path.join(outdir, "plots", "overall", f"overall_{tag}.png"), bbox_inches="tight")
plt.savefig(os.path.join(outdir, "plots", "overall", f"overall_{tag}.png"), bbox_inches="tight", transparent=True)
else:
plt.suptitle("Overall statistics for ePrints repositories")
plt.savefig(os.path.join(outdir, "plots", "overall", "overall.png"), bbox_inches="tight")
plt.savefig(os.path.join(outdir, "plots", "overall", "overall.png"), bbox_inches="tight", transparent=True)

if __name__=="__main__":
parser = argparse.ArgumentParser(
Expand Down
Loading

0 comments on commit 759a8b1

Please sign in to comment.