Merge lp:~tristan-rivoallan/vanilla-miner/vm-598020 into lp:vanilla-miner

Proposed by Tristan Rivoallan
Status: Merged
Merge reported by: Tristan Rivoallan
Merged at revision: not available
Proposed branch: lp:~tristan-rivoallan/vanilla-miner/vm-598020
Merge into: lp:vanilla-miner
Diff against target: 479 lines (+314/-12)
12 files modified
config/doctrine/schema.yml (+21/-1)
data/sql/init_extraction_log.sql (+57/-0)
lib/filter/doctrine/ExtractionLogFormFilter.class.php (+16/-0)
lib/form/doctrine/ExtractionLogForm.class.php (+16/-0)
lib/migration/doctrine/1277385385_version2.php (+51/-0)
lib/migration/doctrine/1277387331_version3.php (+23/-0)
lib/migration/doctrine/1277393030_version4.php (+20/-0)
lib/model/doctrine/ExtractionLog.class.php (+15/-0)
lib/model/doctrine/ExtractionLogTable.class.php (+11/-0)
lib/task/minerExtractlinksTask.class.php (+65/-2)
lib/vendor/CI/Extractor.php (+8/-7)
lib/vendor/CI/Extractor/LussumoVanilla1.php (+11/-2)
To merge this branch: bzr merge lp:~tristan-rivoallan/vanilla-miner/vm-598020
Reviewer Review Type Date Requested Status
Tristan Rivoallan Approve
Review via email: mp+28429@code.launchpad.net
To post a comment you must log in.
Revision history for this message
Tristan Rivoallan (tristan-rivoallan) :
review: Approve

Preview Diff

[H/L] Next/Prev Comment, [J/K] Next/Prev File, [N/P] Next/Prev Hunk
1=== modified file 'config/doctrine/schema.yml'
2--- config/doctrine/schema.yml 2010-06-23 20:45:16 +0000
3+++ config/doctrine/schema.yml 2010-06-24 16:15:42 +0000
4@@ -48,4 +48,24 @@
5 idx_availability:
6 fields:
7 availability:
8- length: 11
9\ No newline at end of file
10+ length: 11
11+
12+ExtractionLog:
13+ actAs:
14+ Timestampable:
15+ columns:
16+ id:
17+ type: integer
18+ autoincrement: true
19+ primary: true
20+ notnull: true
21+ extraction_driver:
22+ type: string
23+ started_on:
24+ type: timestamp
25+ finished_on:
26+ type: timestamp
27+ resources_parsed:
28+ type: int
29+ urls_extracted:
30+ type: int
31\ No newline at end of file
32
33=== added file 'data/sql/init_extraction_log.sql'
34--- data/sql/init_extraction_log.sql 1970-01-01 00:00:00 +0000
35+++ data/sql/init_extraction_log.sql 2010-06-24 16:15:42 +0000
36@@ -0,0 +1,57 @@
37+-- MySQL dump 10.13 Distrib 5.1.41, for debian-linux-gnu (i486)
38+--
39+-- Host: localhost Database: miner
40+-- ------------------------------------------------------
41+-- Server version 5.1.41-3ubuntu12.3
42+
43+/*!40101 SET @OLD_CHARACTER_SET_CLIENT=@@CHARACTER_SET_CLIENT */;
44+/*!40101 SET @OLD_CHARACTER_SET_RESULTS=@@CHARACTER_SET_RESULTS */;
45+/*!40101 SET @OLD_COLLATION_CONNECTION=@@COLLATION_CONNECTION */;
46+/*!40101 SET NAMES utf8 */;
47+/*!40103 SET @OLD_TIME_ZONE=@@TIME_ZONE */;
48+/*!40103 SET TIME_ZONE='+00:00' */;
49+/*!40014 SET @OLD_UNIQUE_CHECKS=@@UNIQUE_CHECKS, UNIQUE_CHECKS=0 */;
50+/*!40014 SET @OLD_FOREIGN_KEY_CHECKS=@@FOREIGN_KEY_CHECKS, FOREIGN_KEY_CHECKS=0 */;
51+/*!40101 SET @OLD_SQL_MODE=@@SQL_MODE, SQL_MODE='NO_AUTO_VALUE_ON_ZERO' */;
52+/*!40111 SET @OLD_SQL_NOTES=@@SQL_NOTES, SQL_NOTES=0 */;
53+
54+--
55+-- Table structure for table `extraction_log`
56+--
57+
58+DROP TABLE IF EXISTS `extraction_log`;
59+/*!40101 SET @saved_cs_client = @@character_set_client */;
60+/*!40101 SET character_set_client = utf8 */;
61+CREATE TABLE `extraction_log` (
62+ `id` bigint(20) NOT NULL AUTO_INCREMENT,
63+ `extraction_driver` text,
64+ `created_at` datetime NOT NULL,
65+ `updated_at` datetime NOT NULL,
66+ `started_on` datetime DEFAULT NULL,
67+ `finished_on` datetime DEFAULT NULL,
68+ `resources_parsed` int(11) DEFAULT NULL,
69+ `urls_extracted` int(11) DEFAULT NULL,
70+ PRIMARY KEY (`id`)
71+) ENGINE=InnoDB AUTO_INCREMENT=2 DEFAULT CHARSET=latin1;
72+/*!40101 SET character_set_client = @saved_cs_client */;
73+
74+--
75+-- Dumping data for table `extraction_log`
76+--
77+
78+LOCK TABLES `extraction_log` WRITE;
79+/*!40000 ALTER TABLE `extraction_log` DISABLE KEYS */;
80+INSERT INTO `extraction_log` VALUES (1,'CI_Extractor_LussumoVanilla1','2010-06-23 12:00:00','2010-06-23 12:00:00','2010-06-23 12:00:00',NULL,32176,16535);
81+/*!40000 ALTER TABLE `extraction_log` ENABLE KEYS */;
82+UNLOCK TABLES;
83+/*!40103 SET TIME_ZONE=@OLD_TIME_ZONE */;
84+
85+/*!40101 SET SQL_MODE=@OLD_SQL_MODE */;
86+/*!40014 SET FOREIGN_KEY_CHECKS=@OLD_FOREIGN_KEY_CHECKS */;
87+/*!40014 SET UNIQUE_CHECKS=@OLD_UNIQUE_CHECKS */;
88+/*!40101 SET CHARACTER_SET_CLIENT=@OLD_CHARACTER_SET_CLIENT */;
89+/*!40101 SET CHARACTER_SET_RESULTS=@OLD_CHARACTER_SET_RESULTS */;
90+/*!40101 SET COLLATION_CONNECTION=@OLD_COLLATION_CONNECTION */;
91+/*!40111 SET SQL_NOTES=@OLD_SQL_NOTES */;
92+
93+-- Dump completed on 2010-06-24 17:40:00
94
95=== added file 'lib/filter/doctrine/ExtractionLogFormFilter.class.php'
96--- lib/filter/doctrine/ExtractionLogFormFilter.class.php 1970-01-01 00:00:00 +0000
97+++ lib/filter/doctrine/ExtractionLogFormFilter.class.php 2010-06-24 16:15:42 +0000
98@@ -0,0 +1,16 @@
99+<?php
100+
101+/**
102+ * ExtractionLog filter form.
103+ *
104+ * @package vanilla-miner
105+ * @subpackage filter
106+ * @author Constructions Incongrues
107+ * @version SVN: $Id: sfDoctrineFormFilterTemplate.php 23810 2009-11-12 11:07:44Z Kris.Wallsmith $
108+ */
109+class ExtractionLogFormFilter extends BaseExtractionLogFormFilter
110+{
111+ public function configure()
112+ {
113+ }
114+}
115
116=== added file 'lib/form/doctrine/ExtractionLogForm.class.php'
117--- lib/form/doctrine/ExtractionLogForm.class.php 1970-01-01 00:00:00 +0000
118+++ lib/form/doctrine/ExtractionLogForm.class.php 2010-06-24 16:15:42 +0000
119@@ -0,0 +1,16 @@
120+<?php
121+
122+/**
123+ * ExtractionLog form.
124+ *
125+ * @package vanilla-miner
126+ * @subpackage form
127+ * @author Constructions Incongrues
128+ * @version SVN: $Id: sfDoctrineFormTemplate.php 23810 2009-11-12 11:07:44Z Kris.Wallsmith $
129+ */
130+class ExtractionLogForm extends BaseExtractionLogForm
131+{
132+ public function configure()
133+ {
134+ }
135+}
136
137=== added file 'lib/migration/doctrine/1277385385_version2.php'
138--- lib/migration/doctrine/1277385385_version2.php 1970-01-01 00:00:00 +0000
139+++ lib/migration/doctrine/1277385385_version2.php 2010-06-24 16:15:42 +0000
140@@ -0,0 +1,51 @@
141+<?php
142+/**
143+ * This class has been auto-generated by the Doctrine ORM Framework
144+ */
145+class Version2 extends Doctrine_Migration_Base
146+{
147+ public function up()
148+ {
149+ $this->createTable('extraction_log', array(
150+ 'id' =>
151+ array(
152+ 'type' => 'integer',
153+ 'autoincrement' => '1',
154+ 'primary' => '1',
155+ 'length' => '8',
156+ ),
157+ 'extraction_driver' =>
158+ array(
159+ 'type' => 'string',
160+ 'length' => '',
161+ ),
162+ 'duration' =>
163+ array(
164+ 'type' => 'int',
165+ 'length' => '',
166+ ),
167+ 'created_at' =>
168+ array(
169+ 'notnull' => '1',
170+ 'type' => 'timestamp',
171+ 'length' => '25',
172+ ),
173+ 'updated_at' =>
174+ array(
175+ 'notnull' => '1',
176+ 'type' => 'timestamp',
177+ 'length' => '25',
178+ ),
179+ ), array(
180+ 'primary' =>
181+ array(
182+ 0 => 'id',
183+ ),
184+ ));
185+ }
186+
187+ public function down()
188+ {
189+ $this->dropTable('extraction_log');
190+ }
191+}
192\ No newline at end of file
193
194=== added file 'lib/migration/doctrine/1277387331_version3.php'
195--- lib/migration/doctrine/1277387331_version3.php 1970-01-01 00:00:00 +0000
196+++ lib/migration/doctrine/1277387331_version3.php 2010-06-24 16:15:42 +0000
197@@ -0,0 +1,23 @@
198+<?php
199+/**
200+ * This class has been auto-generated by the Doctrine ORM Framework
201+ */
202+class Version3 extends Doctrine_Migration_Base
203+{
204+ public function up()
205+ {
206+ $this->removeColumn('extraction_log', 'duration');
207+ $this->addColumn('extraction_log', 'started_on', 'timestamp', '25', array(
208+ ));
209+ $this->addColumn('extraction_log', 'finished_on', 'timestamp', '25', array(
210+ ));
211+ }
212+
213+ public function down()
214+ {
215+ $this->addColumn('extraction_log', 'duration', 'int', '', array(
216+ ));
217+ $this->removeColumn('extraction_log', 'started_on');
218+ $this->removeColumn('extraction_log', 'finished_on');
219+ }
220+}
221\ No newline at end of file
222
223=== added file 'lib/migration/doctrine/1277393030_version4.php'
224--- lib/migration/doctrine/1277393030_version4.php 1970-01-01 00:00:00 +0000
225+++ lib/migration/doctrine/1277393030_version4.php 2010-06-24 16:15:42 +0000
226@@ -0,0 +1,20 @@
227+<?php
228+/**
229+ * This class has been auto-generated by the Doctrine ORM Framework
230+ */
231+class Version4 extends Doctrine_Migration_Base
232+{
233+ public function up()
234+ {
235+ $this->addColumn('extraction_log', 'resources_parsed', 'int', '', array(
236+ ));
237+ $this->addColumn('extraction_log', 'urls_extracted', 'int', '', array(
238+ ));
239+ }
240+
241+ public function down()
242+ {
243+ $this->removeColumn('extraction_log', 'resources_parsed');
244+ $this->removeColumn('extraction_log', 'urls_extracted');
245+ }
246+}
247\ No newline at end of file
248
249=== added file 'lib/model/doctrine/ExtractionLog.class.php'
250--- lib/model/doctrine/ExtractionLog.class.php 1970-01-01 00:00:00 +0000
251+++ lib/model/doctrine/ExtractionLog.class.php 2010-06-24 16:15:42 +0000
252@@ -0,0 +1,15 @@
253+<?php
254+
255+/**
256+ * ExtractionLog
257+ *
258+ * This class has been auto-generated by the Doctrine ORM Framework
259+ *
260+ * @package vanilla-miner
261+ * @subpackage model
262+ * @author Constructions Incongrues
263+ * @version SVN: $Id: Builder.php 7490 2010-03-29 19:53:27Z jwage $
264+ */
265+class ExtractionLog extends BaseExtractionLog
266+{
267+}
268
269=== added file 'lib/model/doctrine/ExtractionLogTable.class.php'
270--- lib/model/doctrine/ExtractionLogTable.class.php 1970-01-01 00:00:00 +0000
271+++ lib/model/doctrine/ExtractionLogTable.class.php 2010-06-24 16:15:42 +0000
272@@ -0,0 +1,11 @@
273+<?php
274+
275+
276+class ExtractionLogTable extends Doctrine_Table
277+{
278+
279+ public static function getInstance()
280+ {
281+ return Doctrine_Core::getTable('ExtractionLog');
282+ }
283+}
284\ No newline at end of file
285
286=== modified file 'lib/task/minerExtractlinksTask.class.php'
287--- lib/task/minerExtractlinksTask.class.php 2010-06-23 16:09:40 +0000
288+++ lib/task/minerExtractlinksTask.class.php 2010-06-24 16:15:42 +0000
289@@ -20,6 +20,7 @@
290 new sfCommandOption('env', null, sfCommandOption::PARAMETER_REQUIRED, 'The environment', 'dev'),
291 new sfCommandOption('connection', null, sfCommandOption::PARAMETER_REQUIRED, 'The connection name', 'doctrine'),
292 new sfCommandOption('extraction-driver', null, sfCommandOption::PARAMETER_REQUIRED, 'Extraction driver class name', 'CI_Extractor_LussumoVanilla1'),
293+ new sfCommandOption('incremental', null, sfCommandOption::PARAMETER_REQUIRED, 'If true, only extracts URLs from new and updated resources since last extraction', true),
294 new sfCommandOption('progress', null, sfCommandOption::PARAMETER_NONE, 'Displays a progress bar'),
295 ));
296
297@@ -55,8 +56,38 @@
298 throw new InvalidArgumentException(sprintf('Class "%s" does not exist', $options['extraction-driver']));
299 }
300
301+ if ($options['incremental'] === 'false')
302+ {
303+ $options['incremental'] = false;
304+ }
305+
306+ // Instantiate database connection
307+ $databaseManager = new sfDatabaseManager($this->configuration);
308+ $connection = $databaseManager->getDatabase($options['connection'])->getConnection();
309+
310+ // If extraction is incremental, retrieve last extraction date
311+ $since = null;
312+ if ($options['incremental'])
313+ {
314+ $since = $this->getLastExtractionDate($connection);
315+ if ($since)
316+ {
317+ $this->logSection('extract', sprintf('Incrementally extracting URLs from resources created or updated since "%s"', $since));
318+ }
319+ else
320+ {
321+ $this->logSection('extract', 'Found no entries in extraction log. Extracting URLs from all resources.');
322+ }
323+ }
324+
325+ // Create new extraction log entry
326+ $log_entry = new ExtractionLog();
327+ $log_entry->extraction_driver = $options['extraction-driver'];
328+ $log_entry->started_on = date('Y-m-d H:i:s');
329+ $log_entry->save();
330+
331 // Instantiate and configure extraction engine
332- $extractor = new $options['extraction-driver']($this->dispatcher, $this->configuration);
333+ $extractor = new $options['extraction-driver']($this->dispatcher, $this->configuration, $since);
334
335 // Extraction statistics
336 $urls_found_count = 0;
337@@ -76,11 +107,16 @@
338 }
339
340 // Extract resources from source and insert them in Links database
341- while ($resource_extraction_info = $extractor->extract($arguments['dsn'], $options['connection']))
342+ while ($resource_extraction_info = $extractor->extract($arguments['dsn'], $options['connection'], $since))
343 {
344 // Update extraction statistics
345 $urls_found_count += $resource_extraction_info['urls_found_count'];
346
347+ // Update extraction log
348+ $log_entry->resources_parsed = $resource_extraction_info['resources_parsed_count'];
349+ $log_entry->urls_extracted = $urls_found_count;
350+ $log_entry->save();
351+
352 // Update progress bar
353 if ($options['progress'])
354 {
355@@ -95,6 +131,33 @@
356 {
357 $this->logSection('extract', 'No resources to extract. Exiting.');
358 }
359+
360+ // Record finish time
361+ $log_entry->finished_on = date('Y-m-d H:i:s');
362+ $log_entry->save();
363+ }
364+
365+ /**
366+ * Returns the `started_on` value of the most recent extraction log entry.
367+ *
368+ * @return string|null Latest `started_on` value, or null when the query yields no usable value
369+ */
370+ private function getLastExtractionDate()
371+ {
372+ // Fetch only the newest `started_on` (descending order, one row) as a single scalar
373+ $last_extraction_date = Doctrine_Query::create()
374+ ->select('l.started_on')
375+ ->from('ExtractionLog l')
376+ ->orderBy('l.started_on desc')
377+ ->limit(1)
378+ ->execute(null, Doctrine_Core::HYDRATE_SINGLE_SCALAR);
379+
380+ if (!$last_extraction_date) // normalise any falsy result to null so callers see one "no date" value
381+ {
382+ $last_extraction_date = null;
383+ }
384+
385+ return $last_extraction_date;
386 }
387
388 /**
389
390=== modified file 'lib/vendor/CI/Extractor.php'
391--- lib/vendor/CI/Extractor.php 2010-06-23 09:28:02 +0000
392+++ lib/vendor/CI/Extractor.php 2010-06-24 16:15:42 +0000
393@@ -13,7 +13,7 @@
394 *
395 * @return array
396 */
397- abstract protected function getResources($dsn_source);
398+ abstract protected function getResources($dsn_source, $since = null); // $since: optional lower date bound, forwarded by extract()
399
400 /**
401 * Returns text from which URLs will be extracted.
402@@ -39,7 +39,7 @@
403 *
404 * @return int
405 */
406- abstract protected function countResources($dsn_source);
407+ abstract protected function countResources($dsn_source, $since = null);
408
409 /**
410 * @var array
411@@ -69,7 +69,7 @@
412 /**
413 * Instanciates and configures extractor.
414 *
415- * @param sfEventDispatcher $event_dispatcher
416+ * @param sfEventDispatcher $event_dispatcher
417 * @param sfProjectConfiguration $configuration
418 */
419 public function __construct(sfEventDispatcher $event_dispatcher, sfProjectConfiguration $configuration)
420@@ -115,15 +115,16 @@
421 /**
422 * Extracts links from source database and inserts them into links collection.
423 *
424- * @param string $dsn_source
425- * @param sfDoctrineConnection $connection_dest
426+ * @param string $dsn_source
427+ * @param string $connection_dest
428+ * @param string $since Extract urls from resources updated or created since this date (Y-m-d H:i:s)
429 */
430- public function extract($dsn_source, $connection_dest)
431+ public function extract($dsn_source, $connection_dest, $since = null)
432 {
433 if (!$this->resources)
434 {
435 // Retrieve comments from database
436- $resources = $this->getResources($dsn_source);
437+ $resources = $this->getResources($dsn_source, $since);
438 $this->log(sprintf('Extracting URLs from %d resources using extractor "%s"', count($resources), get_class($this)));
439 $this->resources = $resources;
440 }
441
442=== modified file 'lib/vendor/CI/Extractor/LussumoVanilla1.php'
443--- lib/vendor/CI/Extractor/LussumoVanilla1.php 2010-06-23 10:13:43 +0000
444+++ lib/vendor/CI/Extractor/LussumoVanilla1.php 2010-06-24 16:15:42 +0000
445@@ -15,13 +15,18 @@
446 *
447 * NEXT : make table prefix configurable
448 */
449- protected function getResources($dsn_source)
450+ protected function getResources($dsn_source, $since = null)
451 {
452 $q = 'select c.CommentID, c.Body, c.DateCreated, c.AuthUserID, c.DiscussionID, d.Name as DiscussionName, u.Name
453 from LUM_Comment c
454 inner join LUM_User u on c.AuthUserID = u.UserID
455 inner join LUM_Discussion d on c.DiscussionID = d.DiscussionID
456 where c.Deleted != 1 and c.WhisperUserID = 0';
457+ if ($since)
458+ {
459+ $q .= sprintf(' and c.DateCreated > "%s"', $since);
460+ }
461+
462
463 return $this->getConnection($dsn_source)->fetchAssoc($q);
464 }
465@@ -74,9 +79,13 @@
466 *
467 * @return int
468 */
469- public function countResources($dsn)
470+ public function countResources($dsn, $since = null)
471 {
472 $q = 'select count(c.CommentID) from LUM_Comment c where c.Deleted != 1 and c.WhisperUserID = 0'; // base filter: excludes rows with Deleted = 1 or a non-zero WhisperUserID
473+ if ($since)
474+ {
475+ $q .= sprintf(' and c.DateCreated > "%s"', $since); // base query already has a WHERE clause, so extend it with AND (a second WHERE is invalid SQL)
476+ }
477
478 return (int)$this->getConnection($dsn)->fetchOne($q);
479 }

Subscribers

People subscribed via source and target branches

to all changes: