Merge branch 'main' into i603
sbenthall authored Oct 10, 2023
2 parents 20647fe + 4d63205 commit 2d90b14
Showing 6 changed files with 136 additions and 21 deletions.
86 changes: 82 additions & 4 deletions bigbang/analysis/datatracker.py
@@ -1,13 +1,17 @@
"""
Scripts for processing data from the IETF DataTracker
"""

from ietfdata.datatracker import *
from ietfdata.datatracker_ext import *
from dateutil.parser import *
import json

import pandas as pd
import re

# Module-level client for the IETF DataTracker (extended interface)
dt = DataTrackerExt()


def draft_authors_from_working_group(acr):
"""
Get a dataframe of all authors of drafts published
@@ -49,4 +53,78 @@ def draft_authors_from_working_group(acr):

    df = pd.DataFrame.from_records(records)

    return df

# Matches the address embedded in a DataTracker person/email URI.
em_re = r"/api/v1/person/email/([A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,7})/"

def email_from_uri(email_uri):
    """Extract the email address from a DataTracker email URI, or None."""
    m = re.match(em_re, email_uri)

    return m.group(1) if m else None
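
# Illustrative usage (hypothetical URI, not fetched from the DataTracker):
#   email_from_uri("/api/v1/person/email/jane.doe@example.org/")  # -> "jane.doe@example.org"
#   email_from_uri("not-a-uri")                                   # -> None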


# Rebind dt to a plain DataTracker client with local caching enabled
dt = DataTracker(use_cache=True)


def get_group_histories(wg_name):
"""
For a working group name, get the group history objects
associated with that working group.
"""
wg = dt.group_from_acronym(wg_name)
group_histories = dt.group_histories(group=wg)

    # For each historical snapshot of the group, fetch its chair role records
    group_role_histories = [
        dt.group_role_histories(
            group=grp_hist,
            name=dt.role_name(RoleNameURI("/api/v1/name/rolename/chair/")),
        )
        for grp_hist in group_histories
    ]

return group_histories, group_role_histories
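
# A minimal usage sketch (assumes network access to the DataTracker and that
# the "dnsop" working group exists):
#   histories, role_histories = get_group_histories("dnsop")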


def leadership_ranges(group_acronym):
"""
For a working group acronym,
get the data about the changes to the Chair role
in that working group history.
"""
wg = dt.group_from_acronym(group_acronym)
group_histories = dt.group_histories(group=wg)

gh = list(group_histories)

    # One record per chair role in each historical snapshot of the group
    gh_chair_records = [
[
{
"datetime_max": h.time,
"datetime_min": h.time,
"email": email_from_uri(r.email.uri),
"person_uri": r.person.uri,
"name": dt.person(r.person).name,
"biography": dt.person(r.person).biography,
}
for r in list(
dt.group_role_histories(
group=h,
name=dt.role_name(RoleNameURI("/api/v1/name/rolename/chair/")),
)
)
]
for h in gh
]

    # Flatten the per-history lists into a single list of records
    gh_chair_records = sum(gh_chair_records, [])
ghcr_df = pd.DataFrame.from_records(gh_chair_records)

    # Aggregate to one row per person, with the earliest and latest times
    # at which they are recorded as chair
    agged = ghcr_df.groupby(["name", "person_uri", "email", "biography"]).agg(
        {"datetime_min": "min", "datetime_max": "max"}
    )

    # Replace the overall earliest/latest timestamps with None; these bound
    # the recorded history rather than actual start/end dates
    agged["datetime_min"].replace({ghcr_df["datetime_min"].min(): None}, inplace=True)

    agged["datetime_max"].replace({ghcr_df["datetime_max"].max(): None}, inplace=True)

return ghcr_df, agged
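
# A minimal usage sketch (assumes network access to the DataTracker):
#   ghcr_df, agged = leadership_ranges("dnsop")
#   # agged has one row per chair, with the min/max datetimes of their
#   # tenure; None marks a bound that extends beyond the recorded history.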
1 change: 0 additions & 1 deletion bigbang/utils.py
@@ -85,7 +85,6 @@ def get_common_head(str1, str2, delimiter=None):
    else:
        # this is ugly control flow; clean it up
        if delimiter is not None:
            dstr1 = str1.split(delimiter)
            dstr2 = str2.split(delimiter)

16 changes: 16 additions & 0 deletions docs/datasets/drafts.rst
@@ -0,0 +1,16 @@
Drafts
********

``BigBang`` supports access to the draft document data of some standards-developing organisations (SDOs).


IETF Datatracker
================

BigBang can also be used to analyze data about IETF RFC drafts.

It does this using the Glasgow IPL group's ``ietfdata`` `tool <https://github.com/glasgow-ipl/ietfdata>`_.

The collection script takes one argument, the acronym of a working group:

``bigbang collect-draft-metadata --working-group httpbis``
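
The underlying functions can also be used directly from Python. A minimal
sketch, assuming the ``bigbang.analysis.datatracker`` module from this
repository and network access to the DataTracker:

.. code-block:: python

    from bigbang.analysis.datatracker import draft_authors_from_working_group

    # One row per author of a draft published by the working group
    df = draft_authors_from_working_group("httpbis")
    print(df.head())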
1 change: 1 addition & 0 deletions docs/datasets/index.rst
@@ -8,5 +8,6 @@ This section outlines how various public mailing lists can be scraped from the
:maxdepth: 2

mailinglists
drafts
ancillary
git
51 changes: 36 additions & 15 deletions docs/datasets/mailinglists.rst
@@ -3,34 +3,40 @@ Mailinglists

Below we describe how the public mailing lists of each of the Internet standards-developing organisations can be scraped from the web. Some mailing lists reach back to 1998 and are multiple GBs in size. Therefore, it can take a considerable amount of time to scrape an entire mailing list. This process can't be sped up, since doing so would amount to a DDoS attack on the archive servers. So be prepared to leave your machine running over (multiple) night(s).


IETF
================

There are several ways to access the public mailing list data of the Internet Engineering Task Force (IETF).

The IETF documents many access methods `here <https://www.ietf.org/how/lists/>`_.
We discuss several options below.

Export from Web Interface
-------------------------

The IETF allows logged-in users to export up to 5000 messages at a time from its
online `mail archive <https://mailarchive.ietf.org/arch/>`_.

These are downloaded as a compressed directory of ``.mbox`` files.

Put this directory in your ``archives/`` directory to make it available for analysis.
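
Once unpacked, the ``.mbox`` files can be read with standard tools. A minimal
sketch using Python's standard library (the file name is hypothetical):

.. code-block:: python

    import mailbox

    # Count the messages in one exported mailbox file
    mbox = mailbox.mbox("archives/dnsop.mbox")
    print(len(mbox))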


Collect-mail from text archives
-------------------------------

The full email archives are also `available as text files <https://www.ietf.org/mail-archive/text/>`_ (``.mail``)
on the web.

BigBang comes with a script for collecting these files. Mail for a specific list can be collected using its archival URL.

For example, for the mailing list archive of the ``dnsop`` working group,
run the following command from the root directory of this repository:

``bigbang collect-mail --url https://www.ietf.org/mail-archive/text/dnsop/``

More information is available in the CLI help interface: ``bigbang collect-mail --help``


W3C
@@ -147,3 +153,18 @@ while an entire mailing list can be ingressed using
"fields": "total",
},
)
Public Mailman 1 Web Archive
==============================

BigBang comes with a script for collecting files from public Mailman 1 web archives. An example of this is the
`scipy-dev <http://mail.python.org/pipermail/scipy-dev/>`_ mailing list page. To collect the archives of the scipy-dev mailing list, run the following command from the root directory of this repository:

``bigbang collect-mail --url http://mail.python.org/pipermail/scipy-dev/``

You can also give this command a file with several URLs, one per line. An example of such a file is provided in the ``examples/`` directory.

``bigbang collect-mail --file examples/urls.txt``
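
As a sketch, such a file simply lists one archive URL per line (the second URL
is hypothetical)::

    http://mail.python.org/pipermail/scipy-dev/
    http://mail.python.org/pipermail/scipy-user/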

Once the data has been collected, BigBang has functions to support analysis.
2 changes: 1 addition & 1 deletion examples/datatracker/Working Group Affiliations.ipynb
@@ -651,7 +651,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.13"
},
"latex_envs": {
"LaTeX_envs_menu_present": true,
