diff --git a/README.md b/README.md index f70f5f6..ba6dd0a 100644 --- a/README.md +++ b/README.md @@ -114,13 +114,21 @@ The release versions that are sent to the Python package index (PyPI) are also t The versioning uses a three part version system, "a.b.c" - "a" represents a major release that may not be backwards compatible. "b" is incremented on minor releases that may contain extra features, but are backwards compatible. "c" releases are bug fixes or other micro changes that developers should feel free to immediately update to. +### Version 1.3 + +* **tag**: [v1.3](https://github.com/DistrictDataLabs/tribe/releases/tag/v1.3) +* **release**: Wednesday, July 6, 2016 +* **commit**: [see tag](#) + +After some feedback about the length of time it was taking to create the edges in the NetworkX graph, we modified the `FreqDist` object to memoize calls to N, B, and M. This means that, on a per-edge basis, far fewer complete traversals of the distribution are carried out. Already we have observed minutes' worth of performance improvements as a result. The Graph's edges also now carry more information: weights by frequency, by count, and by norm (the count relative to the most frequent edge). The Graph itself now carries the email count and the mbox file size alongside its other metadata. + ### Version 1.2 * **tag**: [v1.2](https://github.com/DistrictDataLabs/tribe/releases/tag/v1.2) * **release**: Wednesday, June 22, 2016 -* **commit**: [see tag](#) +* **commit**: [cac3d6c](https://github.com/DistrictDataLabs/tribe/commit/cac3d6cb3f95e9d114528d9beef5307c16ec7266) -In this release we have improved some of the handling code to make things a bit more robust with students who work on a variety of operating systems. For example we have added a progress indicator so that something appears to be happening on very large mbox files (and you're not left wondering). Additionally we have added better error handling so one bad email doesn't ruin your day. 
We also made the library Python 2.7 and Python 3.5 compatible with a better test suite. +In this release we have improved some of the handling code to make things a bit more robust with students who work on a variety of operating systems. For example we have added a progress indicator so that something appears to be happening on very large mbox files (and you're not left wondering). Additionally we have added better error handling so one bad email doesn't ruin your day. We also made the library Python 2.7 and Python 3.5 compatible with a better test suite. ### Version 1.1.2 diff --git a/docs/about.md b/docs/about.md index ae40658..8f1af1f 100644 --- a/docs/about.md +++ b/docs/about.md @@ -47,13 +47,21 @@ The release versions that are sent to the Python package index (PyPI) are also t The versioning uses a three part version system, "a.b.c" - "a" represents a major release that may not be backwards compatible. "b" is incremented on minor releases that may contain extra features, but are backwards compatible. "c" releases are bug fixes or other micro changes that developers should feel free to immediately update to. +### Version 1.3 + +* **tag**: [v1.3](https://github.com/DistrictDataLabs/tribe/releases/tag/v1.3) +* **release**: Wednesday, July 6, 2016 +* **commit**: [see tag](#) + +After some feedback about the length of time it was taking to create the edges in the NetworkX graph, we modified the `FreqDist` object to memoize calls to N, B, and M. This means that, on a per-edge basis, far fewer complete traversals of the distribution are carried out. Already we have observed minutes' worth of performance improvements as a result. The Graph's edges also now carry more information: weights by frequency, by count, and by norm (the count relative to the most frequent edge). The Graph itself now carries the email count and the mbox file size alongside its other metadata. 
+ ### Version 1.2 * **tag**: [v1.2](https://github.com/DistrictDataLabs/tribe/releases/tag/v1.2) * **release**: Wednesday, June 22, 2016 -* **commit**: [see tag](#) +* **commit**: [cac3d6c](https://github.com/DistrictDataLabs/tribe/commit/cac3d6cb3f95e9d114528d9beef5307c16ec7266) -In this release we have improved some of the handling code to make things a bit more robust with students who work on a variety of operating systems. For example we have added a progress indicator so that something appears to be happening on very large mbox files (and you're not left wondering). Additionally we have added better error handling so one bad email doesn't ruin your day. We also made the library Python 2.7 and Python 3.5 compatible with a better test suite. +In this release we have improved some of the handling code to make things a bit more robust with students who work on a variety of operating systems. For example we have added a progress indicator so that something appears to be happening on very large mbox files (and you're not left wondering). Additionally we have added better error handling so one bad email doesn't ruin your day. We also made the library Python 2.7 and Python 3.5 compatible with a better test suite. 
### Version 1.1.2 diff --git a/setup.py b/setup.py index 48894de..3b2c0ce 100755 --- a/setup.py +++ b/setup.py @@ -45,16 +45,18 @@ ## Define the classifiers ## See https://pypi.python.org/pypi?%3Aaction=list_classifiers CLASSIFIERS = ( - 'Development Status :: 4 - Beta', + 'Development Status :: 5 - Production/Stable', 'Environment :: Console', - 'Intended Audience :: Developers', + 'Intended Audience :: Science/Research', 'License :: OSI Approved :: MIT License', 'Natural Language :: English', 'Operating System :: OS Independent', 'Programming Language :: Python', 'Programming Language :: Python :: 2.7', - 'Topic :: Software Development', - 'Topic :: Software Development :: Libraries :: Python Modules', + 'Programming Language :: Python :: 3.5', + 'Topic :: Communications :: Email', + 'Topic :: Scientific/Engineering :: Information Analysis', + 'Topic :: Scientific/Engineering :: Visualization', 'Topic :: Utilities', ) diff --git a/tests/__init__.py b/tests/__init__.py index e5900a1..2182516 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -24,7 +24,7 @@ ## Module Constants ########################################################################## -TEST_VERSION = "1.2" ## Also the expected version of the package +TEST_VERSION = "1.3" ## Also the expected version of the package ########################################################################## ## Initialization Tests diff --git a/tests/fixtures/headers.json b/tests/fixtures/headers.json index 2b3f354..f40a74d 100644 --- a/tests/fixtures/headers.json +++ b/tests/fixtures/headers.json @@ -1 +1 @@ -{"Content-Type": 140, "X-Notifications": 2, "X-Account-Notification-Type": 2, "Received": 397, "X-Received": 173, "From": 140, "Feedback-ID": 2, "To": 140, "Subject": 140, "Date": 140, "Message-ID": 140, "X-Priority": 1, "Received-SPF": 87, "X-GM-THRID": 140, "X-Google-DKIM-Signature": 86, "X-Gmail-Labels": 140, "Authentication-Results": 87, "MIME-Version": 140, "DKIM-Signature": 87, "Delivered-To": 87, 
"Return-Path": 220, "X-Tribe-Message-Count": 140, "X-Automattic-Destination": 1, "x-no-auto-attachment": 3, "X-Gm-Message-State": 86} \ No newline at end of file +{"X-Gmail-Labels": 140, "Delivered-To": 87, "From": 140, "Return-Path": 220, "X-Tribe-Message-Count": 140, "X-Gm-Message-State": 86, "To": 140, "Message-ID": 140, "X-Automattic-Destination": 1, "X-Received": 173, "X-Google-DKIM-Signature": 86, "Date": 140, "X-Notifications": 2, "Received": 397, "Received-SPF": 87, "Authentication-Results": 87, "X-Account-Notification-Type": 2, "MIME-Version": 140, "X-Priority": 1, "X-GM-THRID": 140, "x-no-auto-attachment": 3, "DKIM-Signature": 87, "Content-Type": 140, "Feedback-ID": 2, "Subject": 140} \ No newline at end of file diff --git a/tests/fixtures/test.graphml b/tests/fixtures/test.graphml index ae5e226..60d60c7 100644 --- a/tests/fixtures/test.graphml +++ b/tests/fixtures/test.graphml @@ -1,37 +1,55 @@ - - - + + + + + + + - Wed Jun 22 21:28:08 2016 -0400 - Email Network - fixtures/test.mbox + Wed Jul 06 13:42:31 2016 -0400 + fixtures/test.mbox + 922.8KiB + 140 + Email Network + - - - + + - 0.5428571428571428 + 76 + 1.0 + 0.542857142857 + + + 1 + 0.0131578947368 + 0.00714285714286 - 0.014285714285714285 + 2 + 0.0263157894737 + 0.0142857142857 - 0.40714285714285714 - - - 0.007142857142857143 + 57 + 0.75 + 0.407142857143 - - 0.007142857142857143 + + 3 + 0.0394736842105 + 0.0214285714286 - - 0.02142857142857143 + + 1 + 0.0131578947368 + 0.00714285714286 diff --git a/tests/stats_tests.py b/tests/stats_tests.py new file mode 100644 index 0000000..c97e651 --- /dev/null +++ b/tests/stats_tests.py @@ -0,0 +1,244 @@ +# tests.stats_tests +# Testing for the stats library in Tribe +# +# Author: Benjamin Bengfort +# Created: Wed Jul 06 11:34:10 2016 -0400 +# +# Copyright (C) 2014 District Data Labs +# For license information, see LICENSE.txt +# +# ID: stats_tests.py [] benjamin@bengfort.com $ + +""" +Testing for the stats library in Tribe +""" + 
+########################################################################## +## Imports +########################################################################## + +import os +import random +import unittest + +from tribe.stats import FreqDist + +try: + from cStringIO import StringIO +except ImportError: + from io import StringIO + + +########################################################################## +## Helper Functions +########################################################################## + +LETTERS = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' + +def random_characters(n, letters=LETTERS): + for _ in range(n): + yield random.choice(letters) + + +########################################################################## +## Frequency Distribution Tests +########################################################################## + +class FreqDistTests(unittest.TestCase): + + def test_random_chars(self): + """ + Assert the random chars generator works. + """ + data = list(random_characters(100)) + self.assertEqual(len(data), 100) + + for letter in data: + self.assertIn(letter, LETTERS) + + small = list(random_characters(10, 'abc')) + self.assertEqual(len(small), 10) + + for letter in small: + self.assertIn(letter, frozenset(['a', 'b', 'c'])) + + def test_n_samples(self): + """ + Test the computation of N, the number of samples + """ + dist = FreqDist(random_characters(100)) + self.assertEqual(dist.N, 100) + + def test_memoized_n_samples(self): + """ + Test the memoization of N, the number of samples + """ + dist = FreqDist(random_characters(100)) + self.assertEqual(dist.N, 100) + + for letter in random_characters(100): + dist[letter] += 1 + + self.assertEqual(dist.N, 100) + del dist.N + self.assertEqual(dist.N, 200) + + def test_b_bins(self): + """ + Test the computation of B, the number of bins + """ + dist = FreqDist(random_characters(1000)) + self.assertEqual(dist.B, 26) + + def test_memoized_b_bins(self): + """ + Test the memoization of B, the number of bins + """ 
+ dist = FreqDist(random_characters(1000)) + self.assertEqual(dist.B, 26) + + for letter in random_characters(100, 'abcdef'): + dist[letter] += 1 + + self.assertEqual(dist.B, 26) + del dist.B + self.assertEqual(dist.B, 32) + + def test_m_magnitude(self): + """ + Test the computation of M, the magnitude + """ + dist = FreqDist('aaabbbaaabccddeeffbbccddeegjja') + self.assertEqual(dist.M, 7) + + def test_memoized_m_magnitude(self): + """ + Test the memoization of M, the magnitude + """ + dist = FreqDist('aaabbbaaabccddeeffbbccddeegjja') + self.assertEqual(dist.M, 7) + + for letter in 'aaabbccc': + dist[letter] += 1 + + self.assertEqual(dist.M, 7) + del dist.M + self.assertEqual(dist.M, 10) + + def test_freq(self): + """ + Test the computation of the frequency + """ + samples = list(random_characters(90, 'abc')) + samples.extend(['d']*10) + dist = FreqDist(samples) + + self.assertEqual(dist.N, len(samples)) + self.assertEqual(dist.B, 4) + self.assertAlmostEqual(dist.freq('d'), 0.1) + + for c in 'abc': + self.assertGreater(dist.freq(c), 0.0) + self.assertLess(dist.freq(c), 1.0) + + def test_empty_freq(self): + """ + Test the frequency of an empty distribution + """ + dist = FreqDist() + self.assertEqual(dist.freq('a'), 0) + + def test_norm(self): + """ + Test the computation of the norm + """ + samples = list(random_characters(50, 'abc')) + samples.extend(['d']*50) + dist = FreqDist(samples) + + self.assertEqual(dist.max(), 'd') + self.assertEqual(dist.N, len(samples)) + self.assertEqual(dist.M, 50) + self.assertAlmostEqual(dist.norm('d'), 1.0) + + for c in 'abc': + self.assertGreater(dist.norm(c), 0.0) + self.assertLess(dist.norm(c), 1.0) + + def test_empty_norm(self): + """ + Test the norm of an empty distribution + """ + dist = FreqDist() + self.assertEqual(dist.norm('a'), 0) + + def test_max(self): + """ + Test maximal element selection + """ + dist = FreqDist('aaabbbaaabccddeeffbbccddeegjja') + self.assertEqual(dist.max(), 'a') + + def test_empty_max(self): + """ + 
Test maximal element selection on an empty distribution + """ + dist = FreqDist() + self.assertIsNone(dist.max()) + + def test_ratio(self): + """ + Test the ratio computation + """ + dist = FreqDist('aaabbbaaabccddeeffbbccddeegjja') + self.assertAlmostEqual(dist.ratio('a', 'b'), 1.16666667) + self.assertAlmostEqual(dist.ratio('b', 'a'), 0.85714285) + + def test_missing_ratio(self): + """ + Test that ratio of an unseen element is 0 + """ + dist = FreqDist(random_characters(100, 'abc')) + self.assertEqual(dist.ratio('a', 'd'), 0) + + def test_inverse_ratio(self): + """ + Test that the ratio is correct for the inverse + """ + dist = FreqDist(random_characters(1000, 'abc')) + rtab = dist.ratio('a', 'b') + rtba = dist.ratio('b', 'a') + + riab = 1.0 / rtab + riba = 1.0 / rtba + + self.assertAlmostEqual(riab, rtba) + self.assertAlmostEqual(riba, rtab) + + def test_str(self): + """ + Test the stringification of the frequency distribution + """ + try: + dist = FreqDist(random_characters(1000, 'abc')) + s = str(dist) + r = repr(dist) + p = dist.pprint() + except Exception as e: + self.fail("Stringifcation failed: {}".format(e)) + + def test_dump_and_load(self): + """ + Test the serialization of frequency distribution + """ + fobj = StringIO() + orig = FreqDist(random_characters(1000)) + + # Dump the frequency distribution to the stream + orig.dump(fobj) + + # Seek to 0 and load the frequency distribution + fobj.seek(0) + dist = FreqDist.load(fobj) + + self.assertEqual(orig, dist) diff --git a/tribe-admin.py b/tribe-admin.py index 9cd69e5..a06530b 100755 --- a/tribe-admin.py +++ b/tribe-admin.py @@ -18,17 +18,15 @@ ## Imports ########################################################################## -import os import sys import json import tribe import argparse import networkx as nx -from tribe.viz import * from tribe.utils import timeit -from tribe.stats import FreqDist from tribe.utils import humanizedelta +from tribe.viz import draw_social_network from tribe.extract import 
ConsoleMBoxReader as MBoxReader ########################################################################## @@ -165,17 +163,17 @@ def main(*args): # Draw Command draw_parser = subparsers.add_parser('draw', help='Draw a GraphML using the tribe draw method') - draw_parser.add_argument('-w', '--write', type=str, default='graph.png', help='Location to draw to') + draw_parser.add_argument('-w', '--write', type=str, default=None, help='Location to draw to') draw_parser.add_argument('graphml', nargs=1, type=argparse.FileType('r'), help='Location of GraphML to draw') draw_parser.set_defaults(func=draw) # Handle input from the command line args = parser.parse_args() # Parse the arguments - # try: - msg = args.func(args) # Call the default function - parser.exit(0, msg+"\n") # Exit cleanly with message - # except Exception as e: - # parser.error(str(e)) # Exit with error + try: + msg = args.func(args) # Call the default function + parser.exit(0, msg+"\n") # Exit cleanly with message + except Exception as e: + parser.error(str(e)) # Exit with error if __name__ == '__main__': main(*sys.argv[1:]) diff --git a/tribe/config.py b/tribe/config.py index 33546da..fc6b8af 100644 --- a/tribe/config.py +++ b/tribe/config.py @@ -46,4 +46,4 @@ class TribeConfiguration(confire.Configuration): settings = TribeConfiguration.load() if __name__ == '__main__': - print settings + print(settings) diff --git a/tribe/emails.py b/tribe/emails.py index 07512f4..cfaa83b 100644 --- a/tribe/emails.py +++ b/tribe/emails.py @@ -17,8 +17,6 @@ ## Imports ########################################################################## -import re - from collections import namedtuple from email.utils import parseaddr, formataddr from tribe.utils import unquote diff --git a/tribe/extract.py b/tribe/extract.py index 5c7f840..ea856d8 100644 --- a/tribe/extract.py +++ b/tribe/extract.py @@ -26,7 +26,7 @@ from email.utils import getaddresses from tribe.emails import EmailMeta, EmailAddress from tribe.progress import 
AsyncProgress as Progress -from tribe.utils import parse_date, strfnow, timeit, filesize +from tribe.utils import parse_date, strfnow, filesize ########################################################################## @@ -125,13 +125,15 @@ def relationships(email): # Keep track of all the email to email links - links = FreqDist() + links = FreqDist() + emails = 0 # Iterate over all the extracted emails # Catch exceptions, if any, and move forward # NOTE: This will allow the progress bar to work # NOTE: This will build the graph data structure in memory for email in self.extract(): + emails += 1 try: for combo in relationships(email): links[combo] += 1 @@ -139,10 +141,22 @@ def relationships(email): self.errors[e] += 1 continue - # Construct the networkx graph and add edges - G = nx.Graph(name="Email Network", mbox=self.path, extracted=strfnow()) + # Construct the networkx graph with details about generation. + G = nx.Graph( + name="Email Network", mbox=self.path, + extracted=strfnow(), n_emails=emails, + mbox_size=filesize(self.path), + ) + + # Add edges to the graph with various weight properties from counts. + # NOTE: memoization is used here in the FreqDist to speed things up. for link in links.keys(): - G.add_edge(*link, weight=links.freq(link)) + link_data = { + "weight": links.freq(link), + "count": links[link], + "norm": links.norm(link), + } + G.add_edge(*link, **link_data) # Return the generated graph return G @@ -170,17 +184,22 @@ def __iter__(self): self.path, filesize(self.path) )) - bar = Progress() + # Build the progress bar + pbar = Progress() + + # Iterate through the messages and update the progress bar for msg in super(ConsoleMBoxReader, self).__iter__(): yield msg - bar.update() - bar.stop() + pbar.update() + + # Stop the progress bar and flush + pbar.stop() def count(self, refresh=False): """ Memoize the count function to minimize the reads of large MBox files. 
""" - if not hasattr(self, '_count') or not self._count or refresh: + if not hasattr(self, '_count') or refresh: self._count = sum(1 for _ in super(ConsoleMBoxReader, self).__iter__()) return self._count diff --git a/tribe/stats.py b/tribe/stats.py index 8cc524e..e45f2f9 100644 --- a/tribe/stats.py +++ b/tribe/stats.py @@ -21,6 +21,7 @@ from itertools import islice from collections import Counter +from tribe.utils import memoized ########################################################################## ## Frequency Distribution @@ -43,27 +44,58 @@ def load(klass, stream): dist[sample] = count return dist + @memoized def N(self): """ The total number of samples that have been recorded. For unique samples with counts greater than zero, use B. + + Note: N is memoized meaning if you change the frequency distribution + after accessing this property, you then need to del the property to + force a recomputation of the value. """ return sum(self.values()) + @memoized def B(self): """ Return the number of sample values or bins that have counts > 0. + + Note: B is memoized meaning if you change the frequency distribution + after accessing this property, you then need to del the property to + force a recomputation of the value. """ return len(self) + @memoized + def M(self): + """ + Returns the magnitude or the maximum count of all samples. + + Note: M is memoized meaning if you change the frequency distribution + after accessing this property, you then need to del the property to + force a recomputation of the value. + """ + if len(self) == 0: return 0 + return max(self.values()) + def freq(self, key): """ Returns the frequency of a sample defined as the count of the sample divided by the total number of outcomes. Frequencies are always real numbers in the range [0,1]. 
""" - if self.N() == 0: return 0 - return float(self[key]) / self.N() + if self.N == 0: return 0 + return float(self[key]) / self.N + + def norm(self, key): + """ + Returns the norm of a sample defined as the count of the sample + divided by the count of the most frequent sample. Norms are always + real numbers in the range [0,1]. + """ + if self.M == 0: return 0 + return float(self[key]) / self.M def ratio(self, a, b): """ @@ -97,7 +129,7 @@ def plot(self, *args, **kwargs): ylabel = "Counts" pylab.grid(True, color="silver") - if not "linewidth" in kwargs: + if "linewidth" not in kwargs: kwargs["linewidth"] = 2 if "title" in kwargs: @@ -127,4 +159,4 @@ def pprint(self, maxlen=10): return 'FreqDist({{{0}}})'.format(', '.join(items)) def __str__(self): - return "" % (self.B(), self.N()) + return "".format(self.B, self.N) diff --git a/tribe/utils.py b/tribe/utils.py index f066e94..32c596c 100644 --- a/tribe/utils.py +++ b/tribe/utils.py @@ -22,11 +22,11 @@ from functools import wraps from dateutil import parser +from datetime import datetime from dateutil.tz import tzlocal, tzutc -from datetime import date, datetime, timedelta from dateutil.relativedelta import relativedelta from email.utils import unquote as email_unquote -from email.utils import parsedate_tz, parsedate, mktime_tz +from email.utils import parsedate_tz, mktime_tz ########################################################################## @@ -64,7 +64,7 @@ def parse_date(dtstr): # Otherwise use the dateutil parser return parser.parse(dtstr) - except: + except Exception: return None @@ -94,7 +94,7 @@ def humanizedelta(*args, **kwargs): ## Other Helpers and Decorators ########################################################################## -def unquote(str): +def unquote(s): """ Return a new string which is an unquoted version of str. If str ends and begins with double quotes, they are stripped off. 
Likewise if str @@ -102,8 +102,8 @@ def unquote(str): This method continues to unquote until the string is unchanged. """ - new = email_unquote(str) - if new != str: + new = email_unquote(s) + if new != s: return unquote(new) return new @@ -152,4 +152,8 @@ def fget_memoized(self): setattr(self, attr_name, fget(self)) return getattr(self, attr_name) - return property(fget_memoized) + def fdel(self): + if hasattr(self, attr_name): + delattr(self, attr_name) + + return property(fget_memoized, fdel=fdel) diff --git a/tribe/version.py b/tribe/version.py index ff70c0a..e422827 100644 --- a/tribe/version.py +++ b/tribe/version.py @@ -19,7 +19,7 @@ __version_info__ = { 'major': 1, - 'minor': 2, + 'minor': 3, 'micro': 0, 'releaselevel': 'final', 'serial': 0, diff --git a/tribe/viz.py b/tribe/viz.py index 102a9aa..485fd75 100644 --- a/tribe/viz.py +++ b/tribe/viz.py @@ -50,7 +50,7 @@ def show_simple_network(nodes=12, prob=0.2, hot=False): nx.draw_networkx_edges(G, pos, width=1.0, style='dashed', alpha=0.75) if hot: - center, degree = sorted(G.degree().items(), key=itemgetter(1))[-1] + center, _ = sorted(G.degree().items(), key=itemgetter(1))[-1] nx.draw_networkx_nodes(G, pos, nodelist=[center], node_size=600, node_color="#D9AF0B") plt.axis('off') @@ -59,7 +59,7 @@ def show_simple_network(nodes=12, prob=0.2, hot=False): return G @configure -def draw_social_network(G, path="graph.png", **kwargs): +def draw_social_network(G, path=None): k = 1/math.sqrt(G.order()) * 2 pos = nx.spring_layout(G, k=k) @@ -69,4 +69,7 @@ def draw_social_network(G, path="graph.png", **kwargs): nx.draw_networkx_nodes(G, pos, node_size=deg, linewidths=1.0, alpha=0.90) nx.draw_networkx_edges(G, pos, width=1.0, style='dashed', alpha=0.75) - plt.show() + if path: + plt.savefig(path) + else: + plt.show()