updating plot scripts for poster

softwaresaved · Jul 23, 2024 · 759a8b1 · 759a8b1
1 parent 7b1a3f1
commit 759a8b1
Show file tree

Hide file tree

Showing 5 changed files with 453 additions and 63 deletions.
diff --git a/.gitignore b/.gitignore
@@ -141,6 +141,7 @@ dmypy.json
 !src/analysis/eprints.ipynb
 
 *.png
+#!data/derived/plots/overall/*.png
 
 data/debug/*
 !data/debug/representative_set.csv

diff --git a/src/analysis/eprints.ipynb b/src/analysis/eprints.ipynb
diff --git a/src/analysis/mention_type_timeline.py b/src/analysis/mention_type_timeline.py
@@ -2,10 +2,24 @@
 import os
 import pandas as pd
 import seaborn as sns
+import matplotlib
+matplotlib.use("Agg")
 import matplotlib.pyplot as plt
 import numpy as np
 from datetime import timedelta
 
+SMALL_SIZE = 24
+MEDIUM_SIZE = 30
+
+plt.rc('font', size=SMALL_SIZE)          # controls default text sizes
+#plt.rc('axes', titlesize=MEDIUM_SIZE)     # fontsize of the axes title
+plt.rc('axes', titlesize=SMALL_SIZE)     # fontsize of the axes title
+plt.rc('axes', labelsize=SMALL_SIZE)    # fontsize of the x and y labels
+plt.rc('xtick', labelsize=SMALL_SIZE)    # fontsize of the tick labels
+plt.rc('ytick', labelsize=SMALL_SIZE)    # fontsize of the tick labels
+plt.rc('legend', fontsize=SMALL_SIZE)    # legend fontsize
+plt.rc('figure', titlesize=MEDIUM_SIZE)  # fontsize of the figure title
+
 def main(githubdir, outdir):
     # load data mapping ePrints publication data to intent (produced by repo_intent.py)
     eprints_df = pd.read_csv(os.path.join(outdir, "eprints_w_intent.csv"), index_col=0)
@@ -17,7 +31,7 @@ def main(githubdir, outdir):
     df["created_at"] = pd.to_datetime(df.created_at)
     df["mention type"] = np.where(df["mention_created"], "created", "not created")
     # plot repo creation date against date listed in ePrints entry (assumed to be publication date)
-    ax = plt.axes()
+    fig, ax = plt.subplots(figsize=(10,8))
     ax.grid(True)
 
     xlim = [df["created_at"].min(), df["created_at"].max()]
@@ -34,12 +48,18 @@ def main(githubdir, outdir):
         x = "created_at",
         y = "eprints_date",
         hue="mention type",
+        s=80
     )
+
+    h,l = ax.get_legend_handles_labels()
+    ax.legend_.remove()
+    ax.legend(h, l, ncol=2, loc="upper center", bbox_to_anchor=(0.5, 1.13), borderpad=0.2)
+
     ax.set(xlabel="GitHub repository creation date",
            ylabel="publication date")  # it's usually the publication date, though not always
-    ax.set_title("Mention type depending on distance between repo creation and publication date")
+    ax.set_title("Mention type depending on difference\nbetween repo creation and publication date", pad=45)
     plt.tight_layout()
-    plt.savefig(os.path.join(outdir, "plots/overall/mention_type_timeline.png"))
+    plt.savefig(os.path.join(outdir, "plots/overall/mention_type_timeline.png"), bbox_inches="tight", transparent=True)
 
 if __name__=="__main__":
     parser = argparse.ArgumentParser(

diff --git a/src/analysis/overall.py b/src/analysis/overall.py
@@ -6,9 +6,23 @@
 import string
 import re
 from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
+import matplotlib
+matplotlib.use("Agg")
 from matplotlib import pyplot as plt
 from datetime import datetime
 
+SMALL_SIZE = 24
+MEDIUM_SIZE = 30
+
+plt.rc('font', size=SMALL_SIZE)          # controls default text sizes
+#plt.rc('axes', titlesize=MEDIUM_SIZE)     # fontsize of the axes title
+plt.rc('axes', titlesize=SMALL_SIZE)     # fontsize of the axes title
+plt.rc('axes', labelsize=SMALL_SIZE)    # fontsize of the x and y labels
+plt.rc('xtick', labelsize=SMALL_SIZE)    # fontsize of the tick labels
+plt.rc('ytick', labelsize=SMALL_SIZE)    # fontsize of the tick labels
+plt.rc('legend', fontsize=SMALL_SIZE)    # legend fontsize
+plt.rc('figure', titlesize=MEDIUM_SIZE)  # fontsize of the figure title
+
 def info(verbose, msg):
     if verbose:
         print(f"[INFO] {msg}")
@@ -51,13 +65,14 @@ def plot_license_type(contents, ax):
         contents.license == "other", "unknown", "non-permissive")))
     # plot value counts
     contents.license_type.value_counts().sort_index().plot(
-        kind='bar',
+        kind='barh',
         ax=ax,
-        xlabel="license type",
-        ylabel="repository count"
+        #ylabel="license type",
+        #xlabel="repository count"
     )
     ax.bar_label(ax.containers[0])
-    ax.set_xticklabels(ax.get_xticklabels(), rotation=45)
+    ax.set_title("license type")
+    #ax.set_xticklabels(ax.get_xticklabels(), rotation=45)
 
 def plot_contributing_file_present(contents, ax):
     """Plot a bar chart visualising the number of repositories with contribution guidelines.
@@ -67,13 +82,14 @@ def plot_contributing_file_present(contents, ax):
         ax (Axes): subplot to use
     """
     pd.notna(contents.contributing_added).value_counts().plot(
-        kind='bar',
+        kind='barh',
         ax=ax,
-        xlabel="contributing file",
-        ylabel="repository count"
+        #ylabel="contributing file",
+        #xlabel="repository count"
     )
     ax.bar_label(ax.containers[0])
-    ax.set_xticklabels(ax.get_xticklabels(), rotation=45)
+    ax.set_title("contributing file present")
+    #ax.set_xticklabels(ax.get_xticklabels(), rotation=45)
 
 def plot_emojis(contents, ax):
     """Plot a histogram visualising the number of emojis found in repository READMEs.
@@ -88,9 +104,10 @@ def plot_emojis(contents, ax):
     counts, bins = np.histogram(contents.readme_emojis, bins)
     binlabels = [f"[{bins[i]} - {bins[i+1]})" for i in range(len(bins)-2)]
     binlabels += [f"[{bins[-2]} - {bins[-1]}]"]
-    ax.bar(binlabels, counts)
+    ax.barh(binlabels, counts)
     ax.bar_label(ax.containers[0])
-    ax.set(xlabel="number of emojis in README", ylabel="repository count")
+    #ax.set(ylabel="number of emojis in README", xlabel="repository count")
+    ax.set_title("number of emojis in README")
 
 def plot_team_size(metadata, contributions, ax):
     """Plot a histogram visualising the maximum team size for a repository.
@@ -117,9 +134,11 @@ def plot_team_size(metadata, contributions, ax):
     counts, bins = np.histogram(max_team_size, bins)
     binlabels = [f"[{bins[i]} - {bins[i+1]})" for i in range(len(bins)-2)]
     binlabels += [f"[{bins[-2]} - {bins[-1]}]"]
-    ax.bar(binlabels, counts)
+    ax.barh(binlabels, counts)
     ax.bar_label(ax.containers[0])
-    ax.set(xlabel="maximum team size", ylabel="repository count")
+    #ax.set(ylabel="maximum team size", xlabel="repository count")
+    ax.set(xlabel="repository count")
+    ax.set_title("maximum team size")
 
 def plot_readme_size(contents, ax, type="bar"):
     """Plot a histogram of the size of the README file found in repositories. The bin limits were chosen empirically.
@@ -131,18 +150,20 @@ def plot_readme_size(contents, ax, type="bar"):
     """
     bins = [0, 1, 300, 1500, 10000]
     binmeanings = ["none", "ultra-short", "short", "informative", "detailed"]
+    colours = ["#3875b1", "#f1882e", "#4d9d39", "#c73f30", "#8e6aba"]
     if contents.readme_size.max() > bins[-1]:
         bins.append(contents.readme_size.max())
     counts, bins = np.histogram(contents.readme_size, bins)
     binlabels = [f"{binmeanings[i]}\n[{bins[i]} - {bins[i+1]})" for i in range(len(bins)-2)]
     binlabels += [f"{binmeanings[-1]}\n[{bins[-2]} - {bins[-1]}]"]
     if type=="bar":
-        ax.bar(binlabels, counts)
+        ax.barh(binlabels, counts)
         ax.bar_label(ax.containers[0])
         ax.tick_params(axis='x', labelrotation=45)
-        ax.set(xlabel="size of README in Bytes", ylabel="repository count")
+        ax.set(ylabel="size of README in Bytes", xlabel="repository count")
     elif type=="pie":
-        ax.pie(counts, labels=binlabels, autopct='%1.1f%%')
+        # remove empty bins
+        ax.pie(counts[np.nonzero(counts)], labels=np.array(binlabels)[np.nonzero(counts)], autopct='%1.1f%%', colors=np.array(colours)[np.nonzero(counts)])
         ax.set(xlabel="size of README in Bytes")
 
 def plot_headings(readme_df, ax):
@@ -170,8 +191,8 @@ def plot_headings(readme_df, ax):
         background_color="white",
         random_state=42
         ).generate(" ".join(headings))
-    ax.imshow(wordcloud)
     ax.set_axis_off()
+    ax.imshow(wordcloud)
     ax.set(title="README headings")
 
 def plot_table(metadata, stars, forks, ax):
@@ -189,18 +210,24 @@ def plot_table(metadata, stars, forks, ax):
     star_counts = stars.groupby("github_user_cleaned_url")["user"].count()
     star_counts.rename("stars_no", inplace=True)
     cell_text = [
-        [f"{age.mean():.2f}", f"{age.std():.2f}", f"{age.median():.1f}", f"{age.min():.0f}", f"{age.max():.0f}"],
-        [f"{fork_counts.mean():.2f}", f"{fork_counts.std():.2f}", f"{fork_counts.median():.1f}", f"{fork_counts.min():.0f}", f"{fork_counts.max():.0f}"],
-        [f"{star_counts.mean():.2f}", f"{star_counts.std():.2f}", f"{star_counts.median():.1f}", f"{star_counts.min():.0f}", f"{star_counts.max():.0f}"]
-        ]
+        [f"{age.mean():.2f}", f"{fork_counts.mean():.2f}", f"{star_counts.mean():.2f}"],
+        [f"{age.std():.2f}", f"{fork_counts.std():.2f}", f"{star_counts.std():.2f}"],
+        [f"{age.median():.0f}", f"{fork_counts.median():.0f}", f"{star_counts.median():.0f}"],
+        [f"{age.min():.0f}", f"{fork_counts.min():.0f}", f"{star_counts.min():.0f}"],
+        [f"{age.max():.0f}", f"{fork_counts.max():.0f}", f"{star_counts.max():.0f}"],
+    ]
     table = ax.table(cellText=cell_text,
-                     rowLabels=["age (weeks)", "forks", "stars"],
-                     colLabels=["mean", "std", "median", "min", "max"],
-                     loc='center right'
+                     colLabels=["age (weeks)", "forks", "stars"],
+                     rowLabels=["mean", "std", "median", "min", "max"],
+                     colWidths=[0.4, 0.3, 0.3],
+                     loc='center right',
+                     #bbox=(0, 0, 1, 1)
                      )
-    table.scale(0.85, 1)
+    table.auto_set_font_size(False)
+    table.set_fontsize(SMALL_SIZE)
+    table.scale(1, 3)
     ax.set_axis_off()
-    ax.set(title="stats")
+    ax.set_title("Summary statistics", pad=25)
 
 def main(data_dir, outdir, verbose, filter_path, tag):
     info(verbose, "Loading data...")
@@ -225,15 +252,15 @@ def main(data_dir, outdir, verbose, filter_path, tag):
         forks = forks.loc[forks.github_user_cleaned_url.isin(filtered)]
 
     info(verbose, "Plotting...")
-    fig = plt.figure(figsize=(18, 12))
-    ax1 = plt.subplot(2, 3, 1)
-    ax2 = plt.subplot(2, 3, 2)
-    ax4 = plt.subplot(2, 3, 4)
-    ax5 = plt.subplot(2, 3, 5)
-    ax3 = plt.subplot(6, 3, (3, 9))
-    ax6 = plt.subplot(6, 3, (12, 15))
-    ax7 = plt.subplot(6, 3, 18)
-    fig.tight_layout(h_pad=1, w_pad=5, rect=(0.05, 0.05, 0.95, 0.95))
+    fig = plt.figure(figsize=(20, 18))
+    ax5 = plt.subplot(16, 5, (61, 78))
+    ax4 = plt.subplot(16, 5, (46, 58), sharex=ax5)
+    ax2 = plt.subplot(16, 5, (21, 43), sharex=ax5)
+    ax1 = plt.subplot(16, 5, (1, 18), sharex=ax5)
+    ax3 = plt.subplot(8, 5, (4, 15))
+    ax6 = plt.subplot(8, 5, (19, 30))
+    ax7 = plt.subplot(8, 5, (34, 40))
+    fig.tight_layout(h_pad=0.5, w_pad=3, rect=(0.05, 0.05, 0.95, 0.95))
     plot_license_type(contents, ax1)
     plot_emojis(contents, ax2)
     plot_contributing_file_present(contents, ax4)
@@ -243,10 +270,10 @@ def main(data_dir, outdir, verbose, filter_path, tag):
     plot_table(metadata, stars, forks, ax7)
     if tag:
         plt.suptitle(f"Overall statistics for ePrints repositories ({tag})")
-        plt.savefig(os.path.join(outdir, "plots", "overall", f"overall_{tag}.png"), bbox_inches="tight")
+        plt.savefig(os.path.join(outdir, "plots", "overall", f"overall_{tag}.png"), bbox_inches="tight", transparent=True)
     else:
         plt.suptitle("Overall statistics for ePrints repositories")
-        plt.savefig(os.path.join(outdir, "plots", "overall", "overall.png"), bbox_inches="tight")
+        plt.savefig(os.path.join(outdir, "plots", "overall", "overall.png"), bbox_inches="tight", transparent=True)
 
 if __name__=="__main__":
     parser = argparse.ArgumentParser(