Merge lp:~tristan-rivoallan/vanilla-miner/vm-598020 into lp:vanilla-miner
- vm-598020
- Merge into trunk
Proposed by
Tristan Rivoallan
Status: | Merged | ||||
---|---|---|---|---|---|
Merge reported by: | Tristan Rivoallan | ||||
Merged at revision: | not available | ||||
Proposed branch: | lp:~tristan-rivoallan/vanilla-miner/vm-598020 | ||||
Merge into: | lp:vanilla-miner | ||||
Diff against target: |
479 lines (+314/-12) 12 files modified
config/doctrine/schema.yml (+21/-1) data/sql/init_extraction_log.sql (+57/-0) lib/filter/doctrine/ExtractionLogFormFilter.class.php (+16/-0) lib/form/doctrine/ExtractionLogForm.class.php (+16/-0) lib/migration/doctrine/1277385385_version2.php (+51/-0) lib/migration/doctrine/1277387331_version3.php (+23/-0) lib/migration/doctrine/1277393030_version4.php (+20/-0) lib/model/doctrine/ExtractionLog.class.php (+15/-0) lib/model/doctrine/ExtractionLogTable.class.php (+11/-0) lib/task/minerExtractlinksTask.class.php (+65/-2) lib/vendor/CI/Extractor.php (+8/-7) lib/vendor/CI/Extractor/LussumoVanilla1.php (+11/-2) |
||||
To merge this branch: | bzr merge lp:~tristan-rivoallan/vanilla-miner/vm-598020 | ||||
Related bugs: |
|
Reviewer | Review Type | Date Requested | Status |
---|---|---|---|
Tristan Rivoallan | Approve | ||
Review via email: mp+28429@code.launchpad.net |
Commit message
Description of the change
To post a comment you must log in.
Revision history for this message
Tristan Rivoallan (tristan-rivoallan) : | # |
review:
Approve
Preview Diff
[H/L] Next/Prev Comment, [J/K] Next/Prev File, [N/P] Next/Prev Hunk
1 | === modified file 'config/doctrine/schema.yml' |
2 | --- config/doctrine/schema.yml 2010-06-23 20:45:16 +0000 |
3 | +++ config/doctrine/schema.yml 2010-06-24 16:15:42 +0000 |
4 | @@ -48,4 +48,24 @@ |
5 | idx_availability: |
6 | fields: |
7 | availability: |
8 | - length: 11 |
9 | \ No newline at end of file |
10 | + length: 11 |
11 | + |
12 | +ExtractionLog: |
13 | + actAs: |
14 | + Timestampable: |
15 | + columns: |
16 | + id: |
17 | + type: integer |
18 | + autoincrement: true |
19 | + primary: true |
20 | + notnull: true |
21 | + extraction_driver: |
22 | + type: string |
23 | + started_on: |
24 | + type: timestamp |
25 | + finished_on: |
26 | + type: timestamp |
27 | + resources_parsed: |
28 | + type: int |
29 | + urls_extracted: |
30 | + type: int |
31 | \ No newline at end of file |
32 | |
33 | === added file 'data/sql/init_extraction_log.sql' |
34 | --- data/sql/init_extraction_log.sql 1970-01-01 00:00:00 +0000 |
35 | +++ data/sql/init_extraction_log.sql 2010-06-24 16:15:42 +0000 |
36 | @@ -0,0 +1,57 @@ |
37 | +-- MySQL dump 10.13 Distrib 5.1.41, for debian-linux-gnu (i486) |
38 | +-- |
39 | +-- Host: localhost Database: miner |
40 | +-- ------------------------------------------------------ |
41 | +-- Server version 5.1.41-3ubuntu12.3 |
42 | + |
43 | +/*!40101 SET @OLD_CHARACTER_SET_CLIENT=@@CHARACTER_SET_CLIENT */; |
44 | +/*!40101 SET @OLD_CHARACTER_SET_RESULTS=@@CHARACTER_SET_RESULTS */; |
45 | +/*!40101 SET @OLD_COLLATION_CONNECTION=@@COLLATION_CONNECTION */; |
46 | +/*!40101 SET NAMES utf8 */; |
47 | +/*!40103 SET @OLD_TIME_ZONE=@@TIME_ZONE */; |
48 | +/*!40103 SET TIME_ZONE='+00:00' */; |
49 | +/*!40014 SET @OLD_UNIQUE_CHECKS=@@UNIQUE_CHECKS, UNIQUE_CHECKS=0 */; |
50 | +/*!40014 SET @OLD_FOREIGN_KEY_CHECKS=@@FOREIGN_KEY_CHECKS, FOREIGN_KEY_CHECKS=0 */; |
51 | +/*!40101 SET @OLD_SQL_MODE=@@SQL_MODE, SQL_MODE='NO_AUTO_VALUE_ON_ZERO' */; |
52 | +/*!40111 SET @OLD_SQL_NOTES=@@SQL_NOTES, SQL_NOTES=0 */; |
53 | + |
54 | +-- |
55 | +-- Table structure for table `extraction_log` |
56 | +-- |
57 | + |
58 | +DROP TABLE IF EXISTS `extraction_log`; |
59 | +/*!40101 SET @saved_cs_client = @@character_set_client */; |
60 | +/*!40101 SET character_set_client = utf8 */; |
61 | +CREATE TABLE `extraction_log` ( |
62 | + `id` bigint(20) NOT NULL AUTO_INCREMENT, |
63 | + `extraction_driver` text, |
64 | + `created_at` datetime NOT NULL, |
65 | + `updated_at` datetime NOT NULL, |
66 | + `started_on` datetime DEFAULT NULL, |
67 | + `finished_on` datetime DEFAULT NULL, |
68 | + `resources_parsed` int(11) DEFAULT NULL, |
69 | + `urls_extracted` int(11) DEFAULT NULL, |
70 | + PRIMARY KEY (`id`) |
71 | +) ENGINE=InnoDB AUTO_INCREMENT=2 DEFAULT CHARSET=latin1; |
72 | +/*!40101 SET character_set_client = @saved_cs_client */; |
73 | + |
74 | +-- |
75 | +-- Dumping data for table `extraction_log` |
76 | +-- |
77 | + |
78 | +LOCK TABLES `extraction_log` WRITE; |
79 | +/*!40000 ALTER TABLE `extraction_log` DISABLE KEYS */; |
80 | +INSERT INTO `extraction_log` VALUES (1,'CI_Extractor_LussumoVanilla1','2010-06-23 12:00:00','2010-06-23 12:00:00','2010-06-23 12:00:00',NULL,32176,16535); |
81 | +/*!40000 ALTER TABLE `extraction_log` ENABLE KEYS */; |
82 | +UNLOCK TABLES; |
83 | +/*!40103 SET TIME_ZONE=@OLD_TIME_ZONE */; |
84 | + |
85 | +/*!40101 SET SQL_MODE=@OLD_SQL_MODE */; |
86 | +/*!40014 SET FOREIGN_KEY_CHECKS=@OLD_FOREIGN_KEY_CHECKS */; |
87 | +/*!40014 SET UNIQUE_CHECKS=@OLD_UNIQUE_CHECKS */; |
88 | +/*!40101 SET CHARACTER_SET_CLIENT=@OLD_CHARACTER_SET_CLIENT */; |
89 | +/*!40101 SET CHARACTER_SET_RESULTS=@OLD_CHARACTER_SET_RESULTS */; |
90 | +/*!40101 SET COLLATION_CONNECTION=@OLD_COLLATION_CONNECTION */; |
91 | +/*!40111 SET SQL_NOTES=@OLD_SQL_NOTES */; |
92 | + |
93 | +-- Dump completed on 2010-06-24 17:40:00 |
94 | |
95 | === added file 'lib/filter/doctrine/ExtractionLogFormFilter.class.php' |
96 | --- lib/filter/doctrine/ExtractionLogFormFilter.class.php 1970-01-01 00:00:00 +0000 |
97 | +++ lib/filter/doctrine/ExtractionLogFormFilter.class.php 2010-06-24 16:15:42 +0000 |
98 | @@ -0,0 +1,16 @@ |
99 | +<?php |
100 | + |
101 | +/** |
102 | + * ExtractionLog filter form. |
103 | + * |
104 | + * @package vanilla-miner |
105 | + * @subpackage filter |
106 | + * @author Constructions Incongrues |
107 | + * @version SVN: $Id: sfDoctrineFormFilterTemplate.php 23810 2009-11-12 11:07:44Z Kris.Wallsmith $ |
108 | + */ |
109 | +class ExtractionLogFormFilter extends BaseExtractionLogFormFilter |
110 | +{ |
111 | + public function configure() |
112 | + { |
113 | + } |
114 | +} |
115 | |
116 | === added file 'lib/form/doctrine/ExtractionLogForm.class.php' |
117 | --- lib/form/doctrine/ExtractionLogForm.class.php 1970-01-01 00:00:00 +0000 |
118 | +++ lib/form/doctrine/ExtractionLogForm.class.php 2010-06-24 16:15:42 +0000 |
119 | @@ -0,0 +1,16 @@ |
120 | +<?php |
121 | + |
122 | +/** |
123 | + * ExtractionLog form. |
124 | + * |
125 | + * @package vanilla-miner |
126 | + * @subpackage form |
127 | + * @author Constructions Incongrues |
128 | + * @version SVN: $Id: sfDoctrineFormTemplate.php 23810 2009-11-12 11:07:44Z Kris.Wallsmith $ |
129 | + */ |
130 | +class ExtractionLogForm extends BaseExtractionLogForm |
131 | +{ |
132 | + public function configure() |
133 | + { |
134 | + } |
135 | +} |
136 | |
137 | === added file 'lib/migration/doctrine/1277385385_version2.php' |
138 | --- lib/migration/doctrine/1277385385_version2.php 1970-01-01 00:00:00 +0000 |
139 | +++ lib/migration/doctrine/1277385385_version2.php 2010-06-24 16:15:42 +0000 |
140 | @@ -0,0 +1,51 @@ |
141 | +<?php |
142 | +/** |
143 | + * This class has been auto-generated by the Doctrine ORM Framework |
144 | + */ |
145 | +class Version2 extends Doctrine_Migration_Base |
146 | +{ |
147 | + public function up() |
148 | + { |
149 | + $this->createTable('extraction_log', array( |
150 | + 'id' => |
151 | + array( |
152 | + 'type' => 'integer', |
153 | + 'autoincrement' => '1', |
154 | + 'primary' => '1', |
155 | + 'length' => '8', |
156 | + ), |
157 | + 'extraction_driver' => |
158 | + array( |
159 | + 'type' => 'string', |
160 | + 'length' => '', |
161 | + ), |
162 | + 'duration' => |
163 | + array( |
164 | + 'type' => 'int', |
165 | + 'length' => '', |
166 | + ), |
167 | + 'created_at' => |
168 | + array( |
169 | + 'notnull' => '1', |
170 | + 'type' => 'timestamp', |
171 | + 'length' => '25', |
172 | + ), |
173 | + 'updated_at' => |
174 | + array( |
175 | + 'notnull' => '1', |
176 | + 'type' => 'timestamp', |
177 | + 'length' => '25', |
178 | + ), |
179 | + ), array( |
180 | + 'primary' => |
181 | + array( |
182 | + 0 => 'id', |
183 | + ), |
184 | + )); |
185 | + } |
186 | + |
187 | + public function down() |
188 | + { |
189 | + $this->dropTable('extraction_log'); |
190 | + } |
191 | +} |
192 | \ No newline at end of file |
193 | |
194 | === added file 'lib/migration/doctrine/1277387331_version3.php' |
195 | --- lib/migration/doctrine/1277387331_version3.php 1970-01-01 00:00:00 +0000 |
196 | +++ lib/migration/doctrine/1277387331_version3.php 2010-06-24 16:15:42 +0000 |
197 | @@ -0,0 +1,23 @@ |
198 | +<?php |
199 | +/** |
200 | + * This class has been auto-generated by the Doctrine ORM Framework |
201 | + */ |
202 | +class Version3 extends Doctrine_Migration_Base |
203 | +{ |
204 | + public function up() |
205 | + { |
206 | + $this->removeColumn('extraction_log', 'duration'); |
207 | + $this->addColumn('extraction_log', 'started_on', 'timestamp', '25', array( |
208 | + )); |
209 | + $this->addColumn('extraction_log', 'finished_on', 'timestamp', '25', array( |
210 | + )); |
211 | + } |
212 | + |
213 | + public function down() |
214 | + { |
215 | + $this->addColumn('extraction_log', 'duration', 'int', '', array( |
216 | + )); |
217 | + $this->removeColumn('extraction_log', 'started_on'); |
218 | + $this->removeColumn('extraction_log', 'finished_on'); |
219 | + } |
220 | +} |
221 | \ No newline at end of file |
222 | |
223 | === added file 'lib/migration/doctrine/1277393030_version4.php' |
224 | --- lib/migration/doctrine/1277393030_version4.php 1970-01-01 00:00:00 +0000 |
225 | +++ lib/migration/doctrine/1277393030_version4.php 2010-06-24 16:15:42 +0000 |
226 | @@ -0,0 +1,20 @@ |
227 | +<?php |
228 | +/** |
229 | + * This class has been auto-generated by the Doctrine ORM Framework |
230 | + */ |
231 | +class Version4 extends Doctrine_Migration_Base |
232 | +{ |
233 | + public function up() |
234 | + { |
235 | + $this->addColumn('extraction_log', 'resources_parsed', 'int', '', array( |
236 | + )); |
237 | + $this->addColumn('extraction_log', 'urls_extracted', 'int', '', array( |
238 | + )); |
239 | + } |
240 | + |
241 | + public function down() |
242 | + { |
243 | + $this->removeColumn('extraction_log', 'resources_parsed'); |
244 | + $this->removeColumn('extraction_log', 'urls_extracted'); |
245 | + } |
246 | +} |
247 | \ No newline at end of file |
248 | |
249 | === added file 'lib/model/doctrine/ExtractionLog.class.php' |
250 | --- lib/model/doctrine/ExtractionLog.class.php 1970-01-01 00:00:00 +0000 |
251 | +++ lib/model/doctrine/ExtractionLog.class.php 2010-06-24 16:15:42 +0000 |
252 | @@ -0,0 +1,15 @@ |
253 | +<?php |
254 | + |
255 | +/** |
256 | + * ExtractionLog |
257 | + * |
258 | + * This class has been auto-generated by the Doctrine ORM Framework |
259 | + * |
260 | + * @package vanilla-miner |
261 | + * @subpackage model |
262 | + * @author Constructions Incongrues |
263 | + * @version SVN: $Id: Builder.php 7490 2010-03-29 19:53:27Z jwage $ |
264 | + */ |
265 | +class ExtractionLog extends BaseExtractionLog |
266 | +{ |
267 | +} |
268 | |
269 | === added file 'lib/model/doctrine/ExtractionLogTable.class.php' |
270 | --- lib/model/doctrine/ExtractionLogTable.class.php 1970-01-01 00:00:00 +0000 |
271 | +++ lib/model/doctrine/ExtractionLogTable.class.php 2010-06-24 16:15:42 +0000 |
272 | @@ -0,0 +1,11 @@ |
273 | +<?php |
274 | + |
275 | + |
276 | +class ExtractionLogTable extends Doctrine_Table |
277 | +{ |
278 | + |
279 | + public static function getInstance() |
280 | + { |
281 | + return Doctrine_Core::getTable('ExtractionLog'); |
282 | + } |
283 | +} |
284 | \ No newline at end of file |
285 | |
286 | === modified file 'lib/task/minerExtractlinksTask.class.php' |
287 | --- lib/task/minerExtractlinksTask.class.php 2010-06-23 16:09:40 +0000 |
288 | +++ lib/task/minerExtractlinksTask.class.php 2010-06-24 16:15:42 +0000 |
289 | @@ -20,6 +20,7 @@ |
290 | new sfCommandOption('env', null, sfCommandOption::PARAMETER_REQUIRED, 'The environment', 'dev'), |
291 | new sfCommandOption('connection', null, sfCommandOption::PARAMETER_REQUIRED, 'The connection name', 'doctrine'), |
292 | new sfCommandOption('extraction-driver', null, sfCommandOption::PARAMETER_REQUIRED, 'Extraction driver class name', 'CI_Extractor_LussumoVanilla1'), |
293 | + new sfCommandOption('incremental', null, sfCommandOption::PARAMETER_REQUIRED, 'If true, only extracts URLs from new and updated resources since last extraction', true), |
294 | new sfCommandOption('progress', null, sfCommandOption::PARAMETER_NONE, 'Displays a progress bar'), |
295 | )); |
296 | |
297 | @@ -55,8 +56,38 @@ |
298 | throw new InvalidArgumentException(sprintf('Class "%s" does not exist', $options['extraction-driver'])); |
299 | } |
300 | |
301 | + if ($options['incremental'] === 'false') |
302 | + { |
303 | + $options['incremental'] = false; |
304 | + } |
305 | + |
306 | + // Instantiate database connection |
307 | + $databaseManager = new sfDatabaseManager($this->configuration); |
308 | + $connection = $databaseManager->getDatabase($options['connection'])->getConnection(); |
309 | + |
310 | + // If extraction is incremental, retrieve last extraction date |
311 | + $since = null; |
312 | + if ($options['incremental']) |
313 | + { |
314 | + $since = $this->getLastExtractionDate($connection); |
315 | + if ($since) |
316 | + { |
317 | + $this->logSection('extract', sprintf('Incrementally extracting URLs from resources created or updated since "%s"', $since)); |
318 | + } |
319 | + else |
320 | + { |
321 | + $this->logSection('extract', 'Found no entries in extraction log. Extracting URLs from all resources.'); |
322 | + } |
323 | + } |
324 | + |
325 | + // Create new extraction log entry |
326 | + $log_entry = new ExtractionLog(); |
327 | + $log_entry->extraction_driver = $options['extraction-driver']; |
328 | + $log_entry->started_on = date('Y-m-d H:i:s'); |
329 | + $log_entry->save(); |
330 | + |
331 | // Instanciate and configure extraction engine |
332 | - $extractor = new $options['extraction-driver']($this->dispatcher, $this->configuration); |
333 | + $extractor = new $options['extraction-driver']($this->dispatcher, $this->configuration, $since); |
334 | |
335 | // Extraction statistics |
336 | $urls_found_count = 0; |
337 | @@ -76,11 +107,16 @@ |
338 | } |
339 | |
340 | // Extract resources from source and insert them in Links database |
341 | - while ($resource_extraction_info = $extractor->extract($arguments['dsn'], $options['connection'])) |
342 | + while ($resource_extraction_info = $extractor->extract($arguments['dsn'], $options['connection'], $since)) |
343 | { |
344 | // Update extraction statistics |
345 | $urls_found_count += $resource_extraction_info['urls_found_count']; |
346 | |
347 | + // Update extraction log |
348 | + $log_entry->resources_parsed = $resource_extraction_info['resources_parsed_count']; |
349 | + $log_entry->urls_extracted = $urls_found_count; |
350 | + $log_entry->save(); |
351 | + |
352 | // Update progress bar |
353 | if ($options['progress']) |
354 | { |
355 | @@ -95,6 +131,33 @@ |
356 | { |
357 | $this->logSection('extract', 'No resources to extract. Exiting.'); |
358 | } |
359 | + |
360 | + // Record finish time |
361 | + $log_entry->finished_on = date('Y-m-d H:i:s'); |
362 | + $log_entry->save(); |
363 | + } |
364 | + |
365 | + /** |
366 | + * Returns date of most recent extraction. |
367 | + * |
368 | + * @param string $doctrine_connection |
369 | + */ |
370 | + private function getLastExtractionDate() |
371 | + { |
372 | + // Retrieve last extraction date |
373 | + $last_extraction_date = Doctrine_Query::create() |
374 | + ->select('l.started_on') |
375 | + ->from('ExtractionLog l') |
376 | + ->orderBy('l.started_on desc') |
377 | + ->limit(1) |
378 | + ->execute(null, Doctrine_Core::HYDRATE_SINGLE_SCALAR); |
379 | + |
380 | + if (!$last_extraction_date) |
381 | + { |
382 | + $last_extraction_date = null; |
383 | + } |
384 | + |
385 | + return $last_extraction_date; |
386 | } |
387 | |
388 | /** |
389 | |
390 | === modified file 'lib/vendor/CI/Extractor.php' |
391 | --- lib/vendor/CI/Extractor.php 2010-06-23 09:28:02 +0000 |
392 | +++ lib/vendor/CI/Extractor.php 2010-06-24 16:15:42 +0000 |
393 | @@ -13,7 +13,7 @@ |
394 | * |
395 | * @return array |
396 | */ |
397 | - abstract protected function getResources($dsn_source); |
398 | + abstract protected function getResources($dsn_source, $sources = null); |
399 | |
400 | /** |
401 | * Returns text from which URLs will be extracted. |
402 | @@ -39,7 +39,7 @@ |
403 | * |
404 | * @return int |
405 | */ |
406 | - abstract protected function countResources($dsn_source); |
407 | + abstract protected function countResources($dsn_source, $since = null); |
408 | |
409 | /** |
410 | * @var array |
411 | @@ -69,7 +69,7 @@ |
412 | /** |
413 | * Instanciates and configures extractor. |
414 | * |
415 | - * @param sfEventDispatcher $event_dispatcher |
416 | + * @param sfEventDispatcher $event_dispatcher |
417 | * @param sfProjectConfiguration $configuration |
418 | */ |
419 | public function __construct(sfEventDispatcher $event_dispatcher, sfProjectConfiguration $configuration) |
420 | @@ -115,15 +115,16 @@ |
421 | /** |
422 | * Extracts links from source database and inserts them into links collection. |
423 | * |
424 | - * @param string $dsn_source |
425 | - * @param sfDoctrineConnection $connection_dest |
426 | + * @param string $dsn_source |
427 | + * @param string $connection_dest |
428 | + * @param string $since Extract urls from resources updated or created since this date (Y-m-d H:i:s) |
429 | */ |
430 | - public function extract($dsn_source, $connection_dest) |
431 | + public function extract($dsn_source, $connection_dest, $since = null) |
432 | { |
433 | if (!$this->resources) |
434 | { |
435 | // Retrieve comments from database |
436 | - $resources = $this->getResources($dsn_source); |
437 | + $resources = $this->getResources($dsn_source, $since); |
438 | $this->log(sprintf('Extracting URLs from %d resources using extractor "%s"', count($resources), get_class($this))); |
439 | $this->resources = $resources; |
440 | } |
441 | |
442 | === modified file 'lib/vendor/CI/Extractor/LussumoVanilla1.php' |
443 | --- lib/vendor/CI/Extractor/LussumoVanilla1.php 2010-06-23 10:13:43 +0000 |
444 | +++ lib/vendor/CI/Extractor/LussumoVanilla1.php 2010-06-24 16:15:42 +0000 |
445 | @@ -15,13 +15,18 @@ |
446 | * |
447 | * NEXT : make table prefix configurable |
448 | */ |
449 | - protected function getResources($dsn_source) |
450 | + protected function getResources($dsn_source, $since = null) |
451 | { |
452 | $q = 'select c.CommentID, c.Body, c.DateCreated, c.AuthUserID, c.DiscussionID, d.Name as DiscussionName, u.Name |
453 | from LUM_Comment c |
454 | inner join LUM_User u on c.AuthUserID = u.UserID |
455 | inner join LUM_Discussion d on c.DiscussionID = d.DiscussionID |
456 | where c.Deleted != 1 and c.WhisperUserID = 0'; |
457 | + if ($since) |
458 | + { |
459 | + $q .= sprintf(' and c.DateCreated > "%s"', $since); |
460 | + } |
461 | + |
462 | |
463 | return $this->getConnection($dsn_source)->fetchAssoc($q); |
464 | } |
465 | @@ -74,9 +79,13 @@ |
466 | * |
467 | * @return int |
468 | */ |
469 | - public function countResources($dsn) |
470 | + public function countResources($dsn, $since = null) |
471 | { |
472 | $q = 'select count(c.CommentID) from LUM_Comment c where c.Deleted != 1 and c.WhisperUserID = 0'; |
473 | + if ($since) |
474 | + { |
475 | + $q .= sprintf(' where c.DateCreated > "%s"', $since); |
476 | + } |
477 | |
478 | return (int)$this->getConnection($dsn)->fetchOne($q); |
479 | } |