Merge lp:~exarkun/divmod.org/spambayes-fewer-potatoes into lp:divmod.org
Proposed by: Jean-Paul Calderone
Status: Merged
Approved by: Tristan Seligmann
Approved revision: 2713
Merged at revision: 2698
Proposed branch: lp:~exarkun/divmod.org/spambayes-fewer-potatoes
Merge into: lp:divmod.org
Diff against target: 327 lines (+222/-14), 3 files modified
  Quotient/benchmarks/spambayes (+44/-0)
  Quotient/xquotient/spam.py (+91/-14)
  Quotient/xquotient/test/test_spambayes.py (+87/-0)
To merge this branch: bzr merge lp:~exarkun/divmod.org/spambayes-fewer-potatoes
Related bugs: none
Reviewer: Tristan Seligmann (Approve)
Review via email: mp+121094@code.launchpad.net
Commit message
Description of the change
Some speedups to the new spambayes database layer. Benchmark results are on the ticket (sorry, Launchpad, how do you work exactly?).
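In outline, the speedup is: keep word-info rows in an in-memory read cache, fetch the rows for a whole document in chunked IN queries, and buffer training writes so they hit the database in a single executemany(). A rough standalone sketch of that pattern follows; the names here are illustrative, not the real _SQLite3Classifier API in the diff below:

    import sqlite3

    class CachingWordStore(object):
        """Illustrative sketch only: the read/write caching shape of this branch."""
        def __init__(self, path):
            self.db = sqlite3.connect(path)
            self.db.execute(
                "CREATE TABLE IF NOT EXISTS bayes ("
                "word TEXT PRIMARY KEY, nspam INTEGER, nham INTEGER)")
            self._readCache = {}   # word -> (word, nspam, nham), hot lookups
            self._writeCache = {}  # rows waiting to be flushed after training

        def load(self, words):
            # One batched SELECT per 999 tokens instead of one SELECT per token.
            missing = [w for w in words if w not in self._readCache]
            while missing:
                batch, missing = missing[:999], missing[999:]
                cursor = self.db.execute(
                    "SELECT word, nspam, nham FROM bayes WHERE word IN (%s)"
                    % ", ".join("?" * len(batch)), batch)
                for row in cursor.fetchall():
                    self._readCache[row[0]] = row

        def set(self, word, nspam, nham):
            # Writes only touch memory; the database sees them at flush time.
            self._readCache[word] = self._writeCache[word] = (word, nspam, nham)

        def flush(self):
            # One executemany and one commit per trained document.
            writes = self._writeCache.values()
            self._writeCache = {}
            self.db.executemany(
                "INSERT OR REPLACE INTO bayes (word, nspam, nham) "
                "VALUES (?, ?, ?)", writes)
            self.db.commit()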
2713. By Jean-Paul Calderone

Only load 999 word info rows at a time, since that is the maximum number of variables allowed in a SQLite3 statement.
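For context, exceeding that limit is a hard error rather than a slowdown. A quick illustration against a throwaway in-memory database, assuming a SQLite build with the stock SQLITE_MAX_VARIABLE_NUMBER of 999 (newer builds raise the default, so treat this as a sketch):

    import sqlite3

    db = sqlite3.connect(":memory:")
    db.execute("CREATE TABLE bayes (word TEXT PRIMARY KEY, nspam INTEGER, nham INTEGER)")
    params = ["word%d" % (i,) for i in range(1000)]
    try:
        # 1000 placeholders in one statement, one past the stock limit.
        db.execute(
            "SELECT word FROM bayes WHERE word IN (%s)" % ", ".join("?" * len(params)),
            params)
    except sqlite3.OperationalError, e:
        print e  # "too many SQL variables" on builds with the 999 default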
Glyph Lefkowitz (glyph) wrote:
> Looking at this, it occurs to me that it would be nice if Axiom had an
> intermediate SQL-construction layer that is usable without the full ORM. Then
> again, maybe that's just called "Storm", and it's certainly out of scope for
> this branch ;)
http://
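Purely for illustration, such a layer might hide the placeholder bookkeeping behind composable expressions. Every name below is hypothetical, not an actual Axiom or Storm API:

    class In(object):
        """Hypothetical parameterized IN-clause expression."""
        def __init__(self, column, values):
            self.column, self.values = column, list(values)
        def compile(self):
            placeholders = ", ".join("?" * len(self.values))
            return "%s IN (%s)" % (self.column, placeholders), self.values

    class Select(object):
        """Hypothetical SELECT builder usable without a full ORM."""
        def __init__(self, columns, table, where):
            self.columns, self.table, self.where = columns, table, where
        def compile(self):
            whereSQL, args = self.where.compile()
            sql = "SELECT %s FROM %s WHERE %s" % (
                ", ".join(self.columns), self.table, whereSQL)
            return sql, args

    sql, args = Select(
        ["word", "nspam", "nham"], "bayes", In("word", [u"spam", u"ham"])).compile()
    # sql == "SELECT word, nspam, nham FROM bayes WHERE word IN (?, ?)"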
Preview Diff
=== added directory 'Quotient/benchmarks'
=== added file 'Quotient/benchmarks/spambayes'
--- Quotient/benchmarks/spambayes	1970-01-01 00:00:00 +0000
+++ Quotient/benchmarks/spambayes	2012-08-25 13:12:19 +0000
@@ -0,0 +1,44 @@
+#!/usr/bin/python
+
+# Benchmark of Quotient spambayes filter, both training and classification.
+
+import sys, tempfile, random, time
+
+from xquotient.spam import _SQLite3Classifier
+
+words = list(open('/usr/share/dict/words', 'r'))
+
+TRAINING_FACTOR = 50
+MESSAGE_FACTOR = 500
+
+def adj(duration):
+    return duration / (TRAINING_FACTOR * MESSAGE_FACTOR) * 1000.0
+
+
+def main(argv):
+    prng = random.Random()
+    prng.seed(12345)
+    prng.shuffle(words)
+
+    classifier = _SQLite3Classifier(tempfile.mktemp())
+
+    before = time.time()
+    for i in range(TRAINING_FACTOR):
+        classifier.learn(words[i:i + MESSAGE_FACTOR], True)
+
+    for i in range(TRAINING_FACTOR, TRAINING_FACTOR * 2):
+        classifier.learn(words[i:i + MESSAGE_FACTOR], False)
+    after = time.time()
+
+    print 'Learning: %.2f ms/word' % (adj(after - before),)
+
+    before = time.time()
+    for i in range(TRAINING_FACTOR * 2):
+        classifier.spamprob(words[i:i + MESSAGE_FACTOR])
+    after = time.time()
+
+    print 'Guessing: %.2f ms/word' % (adj(after - before),)
+
+
+if __name__ == '__main__':
+    main(sys.argv)
=== modified file 'Quotient/xquotient/spam.py'
--- Quotient/xquotient/spam.py	2012-08-20 19:51:00 +0000
+++ Quotient/xquotient/spam.py	2012-08-25 13:12:19 +0000
@@ -382,6 +382,21 @@
         statements. These are executed any time the classifier database is
         opened, with the expected failure which occurs any time the schema has
         already been initialized handled and disregarded.
+
+    @ivar _readCache: Word information that is already known, either because it
+        has already been read from the database once or because we wrote the
+        information to the database. Keys are unicode tokens, values are
+        three-sequences of token, nspam, and nham counts. This is used to hold
+        word info between two different Spambayes hooks, C{_getclues} and
+        C{_wordinfoget}. The former has access to all tokens in a particular
+        document, the latter is a potato-programming mistake. Loading all of
+        the values at once in C{_getclues} is a big performance win.
+
+    @ivar _writeCache: Word information that is on its way to the database due
+        to training. This has the same shape as C{_readCache}. Word info is
+        held here until training on one document is complete, then all the word
+        info is dumped into the database in a single SQL operation (via
+        I{executemany}).
     """
 
     SCHEMA = [
@@ -413,6 +428,7 @@
         return get, set, None, doc
     nspam = property(*nspam())
 
+
     def nham():
         doc = """
         A property which reflects the number of messages trained as ham, while
@@ -438,6 +454,9 @@
        """
         classifier.Classifier.__init__(self)
         self.databaseName = databaseName
+        self._readCache = {}
+        self._writeCache = {}
+
         # Open the database, possibly initializing it if it has not yet been
         # initialized, and then load the necessary global state from it (nspam,
         # nham).
@@ -484,6 +503,51 @@
         self.cursor.execute('UPDATE state SET nspam=?, nham=?', (self._nspam, self._nham))
 
 
+    def _getclues(self, wordstream):
+        """
+        Hook into the classification process to speed it up.
+
+        See the base implementation for details about what C{_getclues} is
+        supposed to do. This implementation extends the base to look into
+        wordstream and load all the necessary information with the minimum
+        amount of SQLite3 work, then calls up to the base implementation to let
+        it do the actual classification-related work.
+
+        @param wordstream: An iterable (probably a generator) of tokens from the
+            document to be classified.
+        """
+        # Make sure we can consume it and give it to the base implementation for
+        # consumption.
+        wordstream = list(wordstream)
+
+        # Find all the tokens we don't have in memory already
+        missing = []
+        for word in wordstream:
+            if isinstance(word, str):
+                word = word.decode('utf-8', 'replace')
+            if word not in self._readCache:
+                missing.append(word)
+
+        # Load their state
+        while missing:
+            # SQLite3 allows a maximum of 999 variables.
+            load = missing[:999]
+            del missing[:999]
+            self.cursor.execute(
+                "SELECT word, nspam, nham FROM bayes WHERE word IN (%s)" % (
+                    ", ".join("?" * len(load))),
+                load)
+            rows = self.cursor.fetchall()
+
+            # Save them for later
+            for row in rows:
+                self._readCache[row[0]] = row
+
+        # Let the base class do its thing, which will involve asking us about
+        # that state we just cached.
+        return classifier.Classifier._getclues(self, wordstream)
+
+
     def _get(self, word):
         """
         Load the training data for the given word.
@@ -497,13 +561,22 @@
         """
         if isinstance(word, str):
             word = word.decode('utf-8', 'replace')
-
-        self.cursor.execute(
-            "SELECT word, nspam, nham FROM bayes WHERE word=?", (word,))
-        rows = self.cursor.fetchall()
-        if rows:
-            return rows[0]
-        return None
+        try:
+            # Check to see if we already have this word's info in memory.
+            row = self._readCache[word]
+        except KeyError:
+            # If not, load it from the database.
+            self.cursor.execute(
+                "SELECT word, nspam, nham FROM bayes WHERE word=?", (word,))
+            rows = self.cursor.fetchall()
+            if rows:
+                # Add it to the cache and return it.
+                self._readCache[rows[0][0]] = rows[0]
+                return rows[0]
+            return None
+        else:
+            # Otherwise return what we knew already.
+            return row
 
 
     def _set(self, word, nspam, nham):
@@ -519,10 +592,7 @@
         """
         if isinstance(word, str):
             word = word.decode('utf-8', 'replace')
-        self.cursor.execute(
-            "INSERT OR REPLACE INTO bayes (word, nspam, nham) "
-            "VALUES (?, ?, ?)",
-            (word, nspam, nham))
+        self._readCache[word] = self._writeCache[word] = (word, nspam, nham)
 
 
     def _delete(self, word):
@@ -532,10 +602,12 @@
         @param word: A word (or any other kind of token) to lose training
             information about.
         @type word: C{str} or C{unicode} (but really, C{unicode} please)
+
+        @raise NotImplementedError: Deletion is not actually supported in this
+            backend. Fortunately, Quotient does not need it (it never calls
+            C{unlearn}).
         """
-        if isinstance(word, str):
-            word = word.decode('utf-8', 'replace')
-        self.cursor.execute("DELETE FROM bayes WHERE word=?", (word,))
+        raise NotImplementedError("There is no support for deletion.")
 
 
     def _post_training(self):
@@ -545,6 +617,11 @@
         transaction, which contains all of the database modifications for each
         token in that message.
         """
+        writes = self._writeCache.itervalues()
+        self._writeCache = {}
+        self.cursor.executemany(
+            "INSERT OR REPLACE INTO bayes (word, nspam, nham) "
+            "VALUES (?, ?, ?)", writes)
         self.db.commit()
 
 
=== modified file 'Quotient/xquotient/test/test_spambayes.py'
--- Quotient/xquotient/test/test_spambayes.py	2012-08-20 19:53:37 +0000
+++ Quotient/xquotient/test/test_spambayes.py	2012-08-25 13:12:19 +0000
@@ -42,6 +42,54 @@
         self.assertEqual(bayes.nspam, 1)
 
 
+    def test_spamTokenRecorded(self):
+        """
+        The first time a token is encountered during spam training, a row is
+        inserted into the database counting it as once a spam token, never a ham
+        token.
+        """
+        self.classifier.train(StringIO("spam bad gross"), True)
+        bayes = spam._SQLite3Classifier(self.path)
+        wordInfo = bayes._get(u"spam")
+        self.assertEqual((u"spam", 1, 0), wordInfo)
+
+
+    def test_hamTokenRecorded(self):
+        """
+        The first time a token is encountered during ham training, a row is
+        inserted into the database counting it as never a spam token, once a ham
+        token.
+        """
+        self.classifier.train(StringIO("justice sunshine puppies"), False)
+        bayes = spam._SQLite3Classifier(self.path)
+        wordInfo = bayes._get(u"sunshine")
+        self.assertEqual((u"sunshine", 0, 1), wordInfo)
+
+
+    def test_spamTokenIncremented(self):
+        """
+        Encountered on a subsequent spam training operation, an existing word
+        info row has its spam count incremented and its ham count left alone.
+        """
+        self.classifier.train(StringIO("justice sunshine puppies"), False)
+        self.classifier.train(StringIO("spam bad puppies"), True)
+        bayes = spam._SQLite3Classifier(self.path)
+        wordInfo = bayes._get(u"puppies")
+        self.assertEqual((u"puppies", 1, 1), wordInfo)
+
+
+    def test_hamTokenIncremented(self):
+        """
+        Encountered on a subsequent ham training operation, an existing word
+        info row has its spam count left alone and its ham count incremented.
+        """
+        self.classifier.train(StringIO("spam bad puppies"), True)
+        self.classifier.train(StringIO("justice sunshine puppies"), False)
+        bayes = spam._SQLite3Classifier(self.path)
+        wordInfo = bayes._get(u"puppies")
+        self.assertEqual((u"puppies", 1, 1), wordInfo)
+
+
     def test_nham(self):
         """
         L{SQLite3Classifier} tracks, in memory, the number of ham messages it
@@ -71,6 +119,17 @@
             self.classifier.score(StringIO("spamfulness words of spam")) > 0.99)
 
 
+    def test_spamClassificationWithoutCache(self):
+        """
+        Like L{test_spamClassification}, but ensure no instance cache is used to
+        satisfy word info lookups.
+        """
+        self.classifier.train(StringIO("spam words of spamfulness"), True)
+        classifier = Hammie(spam._SQLite3Classifier(self.path), mode='r')
+        self.assertTrue(
+            classifier.score(StringIO("spamfulness words of spam")) > 0.99)
+
+
     def test_hamClassification(self):
         """
         L{SQLite3Classifier} can be trained with a ham message so as to later
@@ -81,6 +140,34 @@
             self.classifier.score(StringIO("words, very nice")) < 0.01)
 
 
+    def test_hamClassificationWithoutCache(self):
+        """
+        Like L{test_hamClassification}, but ensure no instance cache is used to
+        satisfy word info lookups.
+        """
+        self.classifier.train(StringIO("very nice words"), False)
+        classifier = Hammie(spam._SQLite3Classifier(self.path), mode='r')
+        self.assertTrue(
+            classifier.score(StringIO("words, very nice")) < 0.01)
+
+
+
+    def test_largeDocumentClassification(self):
+        """
+        A document with more than 999 tokens can be successfully classified.
+        """
+        words = []
+        for i in range(1000):
+            word = "word%d" % (i,)
+            words.append(word)
+        document = " ".join(words)
+        self.classifier.train(StringIO(document), False)
+
+        classifier = Hammie(spam._SQLite3Classifier(self.path), mode='r')
+        self.assertTrue(
+            classifier.score(StringIO(document)) < 0.01)
+
+
 
 class SpambayesFilterTestCase(unittest.TestCase, MessageCreationMixin):
     """
Tristan Seligmann wrote:

Looking at this, it occurs to me that it would be nice if Axiom had an intermediate SQL-construction layer that is usable without the full ORM. Then again, maybe that's just called "Storm", and it's certainly out of scope for this branch ;)
The code looks reasonably good to me, the extensive docstrings even better; please merge.