gather-info

#!/usr/bin/env python3
#
# Collect input data from various sources and hand it to the PSM dashboard.
#
# Copyright 2018 Open Tech Strategies, LLC
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published
# by the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

# This script grew gradually from 'google-sheets-api-example.py' in
# github.com/OpenTechStrategies/ots-tools/blob/master/google-sheets-client/;
# see there for Google Sheets API resources etc.

"""Make JSON input for the PSM dashboard, using data from various sources.

Start a virtualenv and install requirements.txt:

  $ virtualenv --python=python3.5 .
  $ source bin/activate
  $ pip install -U -r requirements.txt

  ## Edit the config file as needed.  Getting some values may be
  ## a bit of work: see https://gist.github.com/burnash/6771295 and
  ## https://developers.google.com/sheets/api/quickstart/python.
  ## Note that you will need to have a Google account, and that
  ## account must have access to the spreadsheet whose ID you're
  ## using as your "google_sheet_id."  For the GitHub API key, grant it
  ## access to the `public_repo` scope.
  $ cp psm-dashboard-config.json.tmpl psm-dashboard-config.json
  $ your-favorite-editor psm-dashboard-config.json

  ## This puts psm_reqs.py in your Python import path by putting it in
  ## the current directory.
  $ ln -s somewhere/psm/requirements/psm_reqs.py ./psm_reqs.py

Usage:

  $ ./gather-info > features-info.json

Default output is JSON data as described in the dashboard README.md.
(Note that some "PROGRESS:" prints will appear on stderr; they are being
used as a cheap kind of progress meter.  Just redirect stdout as in
the above example to separate JSON output from the noise on stderr.)

Alternate usage:

  $ ./gather-info --featureless-reqs

With the '--featureless-reqs' option, show instead a list of exactly
those requirements (from the RTM) that are not associated with any
high-level feature.

To see this usage message, run with '-h' or '--help'.
"""

import httplib2
import apiclient
import oauth2client
import datetime
import getopt
import os
import sys
import re
import json
import psm_reqs
import github
import github.GithubException


class PSMFeature:
    """Class representing one PSM high-level feature."""
    def __init__(self, feature_id, description, subfeatures,
                 status, start_date, completed_date, requirements):
        """Create a new feature named FEATURE_ID.
        FEATURE_ID (string): The identifying name for this feature.
        DESCRIPTION (string): Description of this feature.
        SUBFEATURES (list of strings): Subfeature descriptions.
        STATUS (string or None): Status as specified in the source.
          If None, the status is derived from the two dates below.
        START_DATE (string): ISO-8601 date (YYYY-MM-DD HH:MM:SS) or None
        COMPLETED_DATE (string): ISO-8601 date (YYYY-MM-DD HH:MM:SS) or None
        REQUIREMENTS (list of strings): List of requirement IDs."""
        # Alas, the @dataclass decorator is new in Python 3.7,
        # so we can't really depend on it being available here.
        # www.python.org/dev/peps/pep-0557
        # hackernoon.com/a-brief-tour-of-python-3-7-data-classes-22ee5e046517
        self.feature_id = feature_id
        self.description = description
        self.subfeatures = subfeatures
        # Normally we'd use "foo_bar"-style field names for the dates,
        # but since the spec requests "camelCase", we do that instead.
        self.startDate = start_date
        self.completedDate = completed_date
        if status is not None:
            self.status = clean_status(status)
        else:
            self.status = "NotStarted"
            # Since we started putting projected completion dates in the
            # spreadsheet, we can't use the mere presence of a completion
            # date to conclude that a feature is completed.  Instead, if
            # the completion date is in the future, then the feature is
            # "InProgress" rather than "Completed".  So we have to compare
            # the completion date to the current time.
            #
            # We do that with a string comparison, which works because
            # these are ISO-8601 dates at day granularity.
            #
            # There's some edge fuzziness here, not just because we ignore
            # timezone, but also because we ignore running time.  That is,
            # the now() for each PSMFeature instance we create is slightly
            # different.  It mostly doesn't matter, since we're lopping
            # off to day granularity anyway.  If you were to run this
            # script across midnight *and* someone happened to complete a
            # feature at exactly the wrong moment, then sure, you could
            # get a status of "InProgress" when it should say "Completed".
            now = datetime.datetime.now().isoformat()[0:10]
            if self.startDate is None:
                if self.completedDate is not None:
                    sys.stderr.write("WARNING: '%s' has empty startDate but a completedDate of %s\n"
                                     % (self.feature_id, self.completedDate))
            else:
                if self.startDate < now:
                    # A missing (None or empty) completion date, or one
                    # that is still in the future, means the work has
                    # begun but is not finished.  Only a completion
                    # date of today or earlier counts as "Completed".
                    if (not self.completedDate) or (self.completedDate > now):
                        self.status = "InProgress"
                    else:
                        self.status = "Completed"

        self.requirements = requirements

    def __str__(self):
        """Return a multi-line, human-readable summary of this feature.
        Double quotes in the ID, description, and subfeatures are
        backslash-escaped, since those values are shown inside quotes."""
        return """\
        PSM Feature:
          ID:             "%s"
          Description:    "%s"
          Subfeatures:    "%s"
          Status:         "%s"
          Start date:     "%s"
          Completed date: "%s"
          Requirements:   "%s"\n""" \
              % (self.feature_id.replace('"', '\\"'),
                 self.description.replace('"', '\\"'),
                 [sub.replace('"', '\\"') for sub in self.subfeatures],
                 self.status,
                 self.startDate,
                 self.completedDate,
                 list(self.requirements))


class PSMIssue:
    """Class representing one PSM non-PR GitHub issue ticket."""
    def __init__(self, owner, repos, number, title, description,
                 status, start_date, completed_date, labels):
        """
        OWNER (string): The repository owner (GitHub user or organization).
        REPOS (string): The repository name (under OWNER at GitHub).
        NUMBER (integer): The issue number.
        TITLE (string): Issue title, a.k.a. summary.
        DESCRIPTION (string): First comment, a.k.a. body.
        STATUS (string): The issue state as given by the source.  The
          value "closed" is recorded as status "Completed"; any other
          value is recorded as "NotStarted".
        START_DATE (string): An ISO-8601 date (YYYY-MM-DD HH:MM:SS)
        COMPLETED_DATE (string): An ISO-8601 date (YYYY-MM-DD HH:MM:SS)
        LABELS (list of strings): All the labels attached to this issue.
        """
        # Again, wish we had the new Python 3.7 @dataclass decorator.
        self.owner = owner
        self.repos = repos
        self.number = number
        self.url = "https://github.com/%s/%s/issues/%d" \
            % (self.owner, self.repos, self.number)
        self.title = title
        self.description = description
        self.status = "NotStarted"
        # The status logic will get more sophisticated.  If there's
        # been meaningful activity on an issue, e.g., like a pull
        # request attached, then self.status could be "InProgress".
        if status == "closed":
            self.status = "Completed"
        # Again, use "camelCase" because that's what the spec wants
        # for dates.
        # Could validate status and the two date fields here.
        # And should the dates be DateTime type, not string?
        self.startDate = start_date
        self.completedDate = completed_date
        self.labels = labels

    def __str__(self):
        """Return a multi-line, human-readable summary of this issue.
        Double quotes in the title and description are backslash-escaped,
        since those values are shown inside quotes.  A title or
        description of None is shown as an empty string."""
        # NOTE(review): presumably an issue with no body arrives here
        # with a description of None -- confirm against the GitHub API.
        # Either way, don't let a missing value break printing.
        title = self.title if self.title is not None else ""
        description = self.description if self.description is not None else ""
        return """PSM Issue:
          Number:         %d
          URL:            %s
          Title:          "%s"
          Description:    "%s"
          Status:         %s
          Start date:     %s
          Completed date: %s
          Labels:         %s\n""" \
              % (self.number,
                 self.url,
                 title.replace('"', '\\"'),
                 description.replace('"', '\\"'),
                 self.status,
                 # The dates are stored under their camelCase names.
                 self.startDate,
                 self.completedDate,
                 self.labels)


# Where is "class PSMRequirement", you ask?
#
# The class 'PSMRequirement' is defined in the psm_reqs.py module
# already.  We obtain instances of it from psm_reqs.get_reqs().


def clean_status(status):
    """
    Return STATUS in its standardized spelling.

    The spellings 'Complete', 'In Progress', and 'Not Started' become
    'Completed', 'InProgress', and 'NotStarted' respectively.  Any
    other value (for example 'Ongoing', or a status that is already
    standardized) is returned unchanged.
    """
    standard_spellings = {
        "Not Started": "NotStarted",
        "In Progress": "InProgress",
        "Complete": "Completed",
    }
    if status in standard_spellings:
        return standard_spellings[status]
    # Not one of the spellings we know how to clean up; pass it through.
    return status


def to_dict(obj):
    """Return OBJ's instance attributes as a new dict, for JSON output.

    The copy is shallow: the returned dict is a fresh object, but its
    values are the very same objects that OBJ's attributes refer to."""
    return dict(obj.__dict__)


def _parse_options(argv):
    """Parse the command-line options in ARGV.

    ARGV (list of strings): The arguments, without the program name.

    Return True if '--featureless-reqs' was given, else False.  On a
    help option, print the usage message and exit with status 0.  On
    an unrecognized option, write the error to stderr and exit with
    status 1."""
    try:
        (opts, _args) = getopt.getopt(
            argv, "h?",
            ["help", "usage", "featureless-reqs",])
    except getopt.GetoptError as err:
        sys.stderr.write(str(err))
        sys.stderr.write("\n")
        sys.exit(1)

    list_featureless_reqs = False
    for opt, _optarg in opts:
        if opt in ("-h", "-?", "--help", "--usage",):
            print(__doc__)
            sys.exit(0)
        elif opt in ("--featureless-reqs",):
            list_featureless_reqs = True
    return list_featureless_reqs


def _fetch_feature_rows(config):
    """Return the rows found in the configured Google Sheet range.

    CONFIG (dict): The parsed config file.  The keys read here are
      'google_sheet_client_id', 'google_sheet_client_secret',
      'google_sheet_id', and 'google_sheet_range'.

    Exit with status 1 if the sheet yields no values at all."""
    # Grant read-only access
    google_api_scopes = 'https://www.googleapis.com/auth/spreadsheets.readonly'

    # TODO: Warn if this file is readable by anyone other than the user.
    google_creds_store = oauth2client.file.Storage('google-credentials.json')

    google_creds = google_creds_store.get()
    if not google_creds or google_creds.invalid:
        # Interactively prompt for access creds in the browser.
        flow = oauth2client.client.OAuth2WebServerFlow(
            client_id=config['google_sheet_client_id'],
            client_secret=config['google_sheet_client_secret'],
            scope=google_api_scopes,
            redirect_uri='TBD')
        google_creds = oauth2client.tools.run_flow(flow, google_creds_store)

    google_service = apiclient.discovery.build(
        'sheets', 'v4', http=google_creds.authorize(httplib2.Http()))

    # Point Sheets API at the specified sheet.
    google_sheet_id = config['google_sheet_id']

    # Range must be given in "A1" notation, which is described in
    # https://developers.google.com/sheets/api/guides/concepts.
    google_sheet_range = config['google_sheet_range']
    google_sheet_result = google_service.spreadsheets().values().get(
        spreadsheetId=google_sheet_id,
        range=google_sheet_range).execute()
    features_sheet_values = google_sheet_result.get('values', None)
    if features_sheet_values is None:
        sys.stderr.write("ERROR: Nothing found in Google Sheet '%s'.\n"
                         % google_sheet_id)
        sys.exit(1)
    return features_sheet_values


def _split_reqs_from_description(description, reqs_list_re):
    """Separate the parenthesized req ID lists from a description.

    DESCRIPTION (string): A feature description, possibly containing
      one or more lists like "(psm-FR-1.1, psm-FR-1.2)".
    REQS_LIST_RE (compiled regex): Matches one parenthesized list.

    Return a tuple (CLEAN_DESCRIPTION, REQ_IDS): the description with
    the req lists removed (the pieces of text that preceded each list
    are joined with "; "), and the set of all req IDs found.  If
    removing the lists would leave nothing, or if there are no lists,
    CLEAN_DESCRIPTION is just DESCRIPTION unchanged."""
    req_ids = set()
    next_desc_head = description
    unreqqed_desc = ""  # description with reqs lists removed
    m = reqs_list_re.search(next_desc_head)
    while m is not None:
        # The regex allows arbitrary whitespace after each comma, and
        # allows a trailing comma, so strip each piece and skip any
        # piece that turns out to be empty.
        for req_id in m.group(0).strip("()").split(","):
            req_id = req_id.strip()
            if req_id:
                req_ids.add(req_id)
        # Remove this reqs list from the description.
        leading_text = next_desc_head[0:m.start(0)]
        if unreqqed_desc == "":
            unreqqed_desc = leading_text
        else:
            prefix = unreqqed_desc
            suffix = leading_text
            # Replace any original separators with our own
            # consistent semicolon.
            if prefix.endswith(" "):
                prefix = prefix[0:-1]
            if suffix.startswith(", "):
                suffix = suffix[2:]
            unreqqed_desc = prefix + "; " + suffix
        # NOTE(review): any text that follows the last reqs list is
        # not carried into the cleaned description.  That is how this
        # code has always behaved; confirm whether it is intended.
        next_desc_head = next_desc_head[m.end(0):]
        m = reqs_list_re.search(next_desc_head)
    unreqqed_desc = unreqqed_desc.strip()
    if unreqqed_desc != "":
        description = unreqqed_desc
    return (description, req_ids)


def _parse_features(rows):
    """Build PSMFeature objects from the spreadsheet ROWS.

    ROWS (list of lists of strings): The rows from the features sheet.

    Return a tuple (FEATURES, ALL_REQUIREMENTS): a dict mapping feature
    IDs to PSMFeature instances, and the set of every req ID mentioned
    by any feature.  Exit with status 1 if a feature ID appears twice."""
    features = {}
    all_requirements = set()
    this_feature = None
    reqs_list_re = re.compile(r"\((psm-[A-Z][A-Z]-[0-9]+\.[0-9]+(,\s*)?)+\)")
    for row in rows:
        if len(row) == 0:
            # Any empty row ends this feature
            this_feature = None
            continue
        # Rows can be shorter than the full sheet width (presumably
        # because trailing empty cells are omitted -- TODO confirm),
        # so check the length before reaching into a row.
        if row[0].startswith("psm-feature-"):
            feature_id = row[0]
            if feature_id in features:
                sys.stderr.write("ERROR: Found feature '%s' twice.\n"
                                 % feature_id)
                sys.exit(1)
            # Overview of this case: we're going to extract the
            # parenthesized req ID lists (however many of them
            # there are) from the description, resulting in two
            # things: a description that isn't cluttered up with
            # req IDs, and a separate list representing the
            # union of all of this feature's req IDs.
            raw_description = row[1] if len(row) > 1 else ""
            (description, this_feature_reqs) = _split_reqs_from_description(
                raw_description, reqs_list_re)
            all_requirements.update(this_feature_reqs)
            status = row[7] if len(row) > 7 else None
            start_date = row[8] if len(row) > 8 else None
            completed_date = row[9] if len(row) > 9 else None
            this_feature = PSMFeature(feature_id, description,
                                      [], status,
                                      start_date, completed_date,
                                      sorted(this_feature_reqs))
            features[feature_id] = this_feature
        elif ((this_feature is not None)
              and (len(row) > 1)
              and (row[0] == '')
              and (row[1] == '')):
            # A continuation row for the current feature.  It adds a
            # subfeature only if it reaches as far as that column.
            if len(row) > 5:
                subfeature = row[5]
                if subfeature.startswith("- "):
                    subfeature = subfeature[2:]
                this_feature.subfeatures.append(subfeature)
        else:
            # Any other kind of non-empty row ends this feature
            this_feature = None
    return (features, all_requirements)


def _fetch_issues(config):
    """Return a dict mapping issue numbers to PSMIssue instances.

    CONFIG (dict): The parsed config file.  The keys read here are
      'github_owner', 'github_repository', and
      'github_auth_token_secret'.

    All issues, open and closed, are fetched; pull requests are
    skipped.  Exit with status 1 if an issue number appears twice."""
    github_owner_name = config['github_owner']
    github_repos_name = config['github_repository']
    github_auth_token = config['github_auth_token_secret']
    github_api = github.Github(github_auth_token)
    # Using lazy polymorphism here: either "organization/repository"
    # or "username/repository" would work, and all the methods we
    # need exist in both types.
    try:
        github_owner = github_api.get_organization(github_owner_name)
    except github.UnknownObjectException:
        github_owner = github_api.get_user(github_owner_name)
    github_repos = github_owner.get_repo(github_repos_name)
    github_issues_raw = github_repos.get_issues(state="all")  # both open and closed
    issues = {}
    for raw_issue in github_issues_raw:
        # For development, uncomment this conditional to loop over
        # only a subset of issues, since waiting for the GitHub API's
        # just-in-time fetching can be slow.
        #
        # if raw_issue.number < 700 or raw_issue.number > 740:
        #     continue
        if raw_issue.number in issues:
            sys.stderr.write("ERROR: Found issue %d twice.\n"
                             % raw_issue.number)
            sys.exit(1)
        if raw_issue.pull_request is not None:
            # Pull requests are not tracked as issues.
            continue
        start_date = None
        completed_date = None
        # Keep just the "YYYY-MM-DD" part of each timestamp.
        if raw_issue.created_at is not None:
            start_date = str(raw_issue.created_at)[:10]
        if raw_issue.closed_at is not None:
            completed_date = str(raw_issue.closed_at)[:10]
        issue = PSMIssue(
            github_owner_name,
            github_repos_name,
            raw_issue.number,
            raw_issue.title,
            raw_issue.body,
            raw_issue.state,
            start_date,
            completed_date,
            sorted(label.name for label in raw_issue.labels))
        # Because it takes a long time to paginate through the issues,
        # we'll want some kind of progress meter.  For now, use this
        # debugging statement to know that things are moving along:
        sys.stderr.write("PROGRESS: ingested issue #%d\n" % issue.number)
        issues[issue.number] = issue
    return issues


def _link_issues_to_reqs(reqs, issues):
    """Attach issue numbers and placeholder status fields to each req.

    REQS (dict): Maps req IDs to requirement objects; each object
      gets new 'issues', 'startDate', 'completedDate', and 'status'
      attributes.
    ISSUES (dict): Maps issue numbers to PSMIssue instances.

    An issue belongs to a requirement when it carries a label of the
    form "Z-REQ-PSM-<suffix>"; the req ID is then "psm-<suffix>"."""
    label_prefix = "Z-REQ-PSM-"
    # Dictionary mapping req IDs to sets of issue numbers (ints).
    reqs_to_issues = {}
    for issue in issues.values():
        sys.stderr.write("PROGRESS: checking issue #%d for req labels\n" % issue.number)
        for label in issue.labels:
            if label.startswith(label_prefix):
                req_id = "psm-" + label[len(label_prefix):]
                sys.stderr.write("PROGRESS:    found req: '%s'\n" % req_id)
                reqs_to_issues.setdefault(req_id, set()).add(issue.number)

    # Some of the statuses, start dates, and completed dates are still
    # placeholders -- we'll need to calculate them.  (Though at least
    # in PSMIssue we already have the dates trivially from GitHub).
    # Also, since the requirement objects don't natively have these
    # fields, we add them here, albeit still as placeholders.
    for req in reqs.values():
        # Just shoehorn it on in there.  Thanks, Python.
        req.issues = sorted(reqs_to_issues.get(req.req_id, ()))
        req.startDate = None
        req.completedDate = None
        req.status = "NotStarted"


def _print_featureless_reqs(features, reqs):
    """Print each requirement that no feature is associated with.

    FEATURES (dict): Maps feature IDs to PSMFeature instances.
    REQS (dict): Maps req IDs to requirement objects."""
    # Maps req IDs to sets of feature IDs.  This tells you
    # which features are associated with a given requirement.
    sys.stderr.write("PROGRESS: Listing featureless requirements\n")
    req_ids_to_features = {}
    for feature in features.values():
        sys.stderr.write("PROGRESS: probing feature '%s'\n" % feature.feature_id)
        for req_id in feature.requirements:
            sys.stderr.write("PROGRESS: registering req '%s'\n" % req_id)
            req_ids_to_features.setdefault(req_id, set()).add(feature.feature_id)
    # But then we don't actually use all the data we've gathered.
    # We just ask the boolean question "Does a given requirement
    # have any features associated with it?".  However, later if
    # we want to show *which* features are associated with a given
    # requirement, for those reqs that have features at all, we
    # will easily be able to do so.
    for req_id, req in reqs.items():
        if req_id in req_ids_to_features:
            continue
        print("* No associated features for requirement '%s':" % req_id)
        print("")
        print("  %s" % req.description)
        if req.comment is not None and req.comment != "":
            print("")
            print("  %s" % req.comment)
        print("")


def _print_dashboard_json(features, reqs, issues):
    """Print the features, requirements, and issues as one JSON object.

    FEATURES (dict): Maps feature IDs to PSMFeature instances.
    REQS (dict): Maps req IDs to requirement objects.
    ISSUES (dict): Maps issue numbers to PSMIssue instances."""
    # The top-level dict that will be printed as JSON.
    output = {
        "features": {feature_id: to_dict(feature)
                     for feature_id, feature in features.items()},
        "requirements": {req_id: to_dict(req)
                         for req_id, req in reqs.items()},
        "issues": {issue_number: to_dict(issue)
                   for issue_number, issue in issues.items()},
    }
    print("%s" % json.dumps(output, indent=4))


def main():
    """Gather features, requirements, and issues, then print the result.

    Features are read from the Google spreadsheet, requirements from
    the file named by the 'psm_reqs' config parameter, and issues from
    GitHub.  By default the combined data is printed to stdout as
    JSON; with '--featureless-reqs', the requirements that have no
    associated feature are listed instead.  Progress and error
    messages go to stderr."""
    list_featureless_reqs = _parse_options(sys.argv[1:])

    # TODO: It would be nice to check access perms on the file
    # here and warn if they're too open.
    with open('psm-dashboard-config.json') as config_file:
        config = json.load(config_file)

    ##### Get PSM features from the Google spreadsheet. #####
    (features, all_requirements) = _parse_features(_fetch_feature_rows(config))

    sys.stderr.write("\n")
    sys.stderr.write("PROGRESS: Found %d features (with %d reqs across them)\n"
                     % (len(features), len(all_requirements)))
    sys.stderr.write("\n")

    ##### Get PSM requirements from the config XLSX file. #####
    # Only the config lookup is guarded, so that a KeyError raised
    # while reading the requirements themselves is not misreported
    # as a missing config parameter.
    try:
        psm_reqs_path = config['psm_reqs']
    except KeyError:
        sys.stderr.write("ERROR: Can't find RTM.xlsx.\n")
        sys.stderr.write("       You must set the 'psm_reqs' parameter in 'psm-dashboard-config.json'\n")
        sys.stderr.write("       to point to the requirements/RTM.xlsx spreadsheet that lives in\n")
        sys.stderr.write("       the PSM tree.\n")
        sys.stderr.write("       \n")
        raise
    reqs = psm_reqs.get_reqs(psm_reqs_path)

    ##### Get the issue<->req mapping from GitHub. #####
    issues = _fetch_issues(config)

    ##### Link issues into requirements; set placeholder dates. #####
    _link_issues_to_reqs(reqs, issues)

    ##### Output, finally. #####
    if list_featureless_reqs:
        _print_featureless_reqs(features, reqs)
    else:
        _print_dashboard_json(features, reqs, issues)


# Run main() only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == '__main__':
    main()