Skip to content
This repository has been archived by the owner on Feb 8, 2018. It is now read-only.

Rebuild npm syncing #4438

Merged
merged 11 commits into from
May 5, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ language: python
git:
depth: 5
addons:
postgresql: 9.3
postgresql: 9.5
firefox: latest-esr
before_install:
- git branch -vv | grep '^*'
Expand Down
16 changes: 8 additions & 8 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ Quick Start
Local
-----

Given Python 2.7, Postgres 9.3, and a C/make toolchain:
Given Python 2.7, Postgres 9.6, and a C/make toolchain:

```shell
git clone https://github.com/gratipay/gratipay.com.git
Expand Down Expand Up @@ -116,7 +116,7 @@ On Debian or Ubuntu you will need the following packages:

```shell
sudo apt-get install \
postgresql-9.3 \
postgresql-9.6 \
postgresql-contrib \
libpq-dev \
python-dev \
Expand Down Expand Up @@ -386,7 +386,7 @@ Modifying the Database
======================

We write SQL, specifically the [PostgreSQL
variant](https://www.postgresql.org/docs/9.3/static/). We keep our database
variant](https://www.postgresql.org/docs/9.6/static/). We keep our database
schema in
[`schema.sql`](https://github.com/gratipay/gratipay.com/blob/master/sql/schema.sql),
and we write schema changes for each PR branch in a `sql/branch.sql` file, which
Expand Down Expand Up @@ -436,11 +436,11 @@ database configured in your testing environment.
Local Database Setup
--------------------

For the best development experience, you need a local
installation of [Postgres](https://www.postgresql.org/download/). The best
version of Postgres to use is 9.3.5, because that's what we're using in
production at Heroku. You need at least 9.2, because we depend on being able to
specify a URI to `psql`, and that was added in 9.2.
For the best development experience, you need a local installation of
[Postgres](https://www.postgresql.org/download/). The best version of Postgres
to use is 9.6.2, because that's what we're using in production at Heroku. You
need at least 9.5 to support the features we depend on, along with the
`pg_stat_statments` and `pg_trgm` extensions.

+ Mac: use Homebrew: `brew install postgres`
+ Ubuntu: use Apt: `apt-get install postgresql postgresql-contrib libpq-dev`
Expand Down
1 change: 1 addition & 0 deletions defaults.env
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,7 @@ EMAIL_QUEUE_ALLOW_UP_TO=3

UPDATE_CTA_EVERY=300
CHECK_DB_EVERY=600
CHECK_NPM_SYNC_EVERY=0
OPTIMIZELY_ID=
INCLUDE_PIWIK=no
SENTRY_DSN=
Expand Down
3 changes: 2 additions & 1 deletion gratipay/application.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

import psycopg2.extras

from . import email, utils
from . import email, sync_npm, utils
from .cron import Cron
from .models import GratipayDB
from .payday_runner import PaydayRunner
Expand Down Expand Up @@ -53,6 +53,7 @@ def install_periodic_jobs(self, website, env, db):
cron(env.update_cta_every, lambda: utils.update_cta(website))
cron(env.check_db_every, db.self_check, True)
cron(env.email_queue_flush_every, self.email_queue.flush, True)
cron(env.check_npm_sync_every, lambda: sync_npm.check(db))


def add_event(self, c, type, payload):
Expand Down
46 changes: 22 additions & 24 deletions gratipay/cli/sync_npm.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,40 +3,38 @@
"""
from __future__ import absolute_import, division, print_function, unicode_literals

import sys
import argparse

from gratipay import wireup
from gratipay.sync_npm import serialize, upsert


def parse_args(argv):
p = argparse.ArgumentParser()
p.add_argument('command', choices=['serialize', 'upsert'])
p.add_argument('path', help='the path to the input file', nargs='?', default='/dev/stdin')
return p.parse_args(argv)
import time

from aspen import log

subcommands = { 'serialize': serialize.main
, 'upsert': upsert.main
}
from gratipay import wireup
from gratipay.sync_npm import consume_change_stream, get_last_seq, production_change_stream
from gratipay.utils import sentry


def main(argv=sys.argv):
def main():
"""This function is installed via an entrypoint in ``setup.py`` as
``sync-npm``.

Usage::

sync-npm {serialize,upsert} {<filepath>}

``<filepath>`` defaults to stdin.

.. note:: Sphinx is expanding ``sys.argv`` in the parameter list. Sorry. :-/
sync-npm

"""
env = wireup.env()
args = parse_args(argv[1:])
db = wireup.db(env)

subcommands[args.command](env, args, db)
while 1:
with sentry.teller(env):
last_seq = get_last_seq(db)
log("Picking up with npm sync at {}.".format(last_seq))
stream = production_change_stream(last_seq)
consume_change_stream(stream, db)
try:
last_seq = get_last_seq(db)
sleep_for = 60
log( 'Encountered an error, will pick up with %s in %s seconds (Ctrl-C to exit) ...'
% (last_seq, sleep_for)
)
time.sleep(sleep_for) # avoid a busy loop if thrashing
except KeyboardInterrupt:
return
84 changes: 84 additions & 0 deletions gratipay/sync_npm.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import, division, print_function, unicode_literals

import requests
from couchdb import Database


REGISTRY_URL = 'https://replicate.npmjs.com/'


def get_last_seq(db):
return db.one('SELECT npm_last_seq FROM worker_coordination')


def production_change_stream(seq):
"""Given a sequence number in the npm registry change stream, start
streaming from there!
"""
return Database(REGISTRY_URL).changes(feed='continuous', include_docs=True, since=seq)


def process_doc(doc):
"""Return a smoothed-out doc, or None if it's not a package doc, meaning
there's no name key and it's probably a design doc, per:

https://github.com/npm/registry/blob/aef8a275/docs/follower.md#clean-up

"""
if 'name' not in doc:
return None
name = doc['name']
description = doc.get('description', '')
emails = [e for e in [m.get('email') for m in doc.get('maintainers', [])] if e.strip()]
return {'name': name, 'description': description, 'emails': sorted(set(emails))}


def consume_change_stream(stream, db):
"""Given an iterable of CouchDB change notifications and a
:py:class:`~GratipayDB`, read from the stream and write to the db.

The npm registry is a CouchDB app, which means we get a change stream from
it that allows us to follow registry updates in near-realtime. Our strategy
here is to maintain open connections to both the registry and our own
database, and write as we read.

"""
with db.get_connection() as connection:
for change in stream:

# Decide what to do.
if change.get('deleted'):
op, arg = delete, change['id']
else:
processed_doc = process_doc(change['doc'])
if not processed_doc:
continue
op, arg = upsert, processed_doc

# Do it.
cursor = connection.cursor()
op(cursor, arg)
cursor.run('UPDATE worker_coordination SET npm_last_seq=%(seq)s', change)
connection.commit()


def delete(cursor, name):
cursor.run("DELETE FROM packages WHERE package_manager='npm' AND name=%s", (name,))


def upsert(cursor, processed):
cursor.run('''
INSERT INTO packages
(package_manager, name, description, emails)
VALUES ('npm', %(name)s, %(description)s, %(emails)s)

ON CONFLICT (package_manager, name) DO UPDATE
SET description=%(description)s, emails=%(emails)s
''', processed)


def check(db, _print=print):
ours = db.one('SELECT npm_last_seq FROM worker_coordination')
theirs = int(requests.get(REGISTRY_URL).json()['update_seq'])
_print("count#npm-sync-lag={}".format(theirs - ours))
43 changes: 0 additions & 43 deletions gratipay/sync_npm/__init__.py

This file was deleted.

107 changes: 0 additions & 107 deletions gratipay/sync_npm/serialize.py

This file was deleted.

Loading