Merge lp:~tristan-rivoallan/vanilla-miner/vm-599375 into lp:vanilla-miner

Proposed by Tristan Rivoallan
Status: Merged
Merged at revision: 56
Proposed branch: lp:~tristan-rivoallan/vanilla-miner/vm-599375
Merge into: lp:vanilla-miner
Diff against target: 394 lines (+181/-157)
5 files modified
config/doctrine/schema.yml (+3/-1)
lib/migration/doctrine/1277735444_version5.php (+21/-0)
lib/task/minerExtractlinksTask.class.php (+155/-154)
lib/vendor/CI/Extractor.php (+1/-1)
lib/vendor/CI/Extractor/LussumoVanilla1.php (+1/-1)
To merge this branch: bzr merge lp:~tristan-rivoallan/vanilla-miner/vm-599375
| Reviewer | Review Type | Date Requested | Status |
| --- | --- | --- | --- |
| Tristan Rivoallan | | | Approve |
Review via email: mp+28770@code.launchpad.net
To post a comment you must log in.
Revision history for this message
Tristan Rivoallan (tristan-rivoallan) :
review: Approve

Preview Diff

[H/L] Next/Prev Comment, [J/K] Next/Prev File, [N/P] Next/Prev Hunk
1=== modified file 'config/doctrine/schema.yml'
2--- config/doctrine/schema.yml 2010-06-24 15:56:59 +0000
3+++ config/doctrine/schema.yml 2010-06-29 16:24:30 +0000
4@@ -67,5 +67,7 @@
5 type: timestamp
6 resources_parsed:
7 type: int
8+ default: 0
9 urls_extracted:
10- type: int
11\ No newline at end of file
12+ type: int
13+ default: 0
14\ No newline at end of file
15
16=== added file 'lib/migration/doctrine/1277735444_version5.php'
17--- lib/migration/doctrine/1277735444_version5.php 1970-01-01 00:00:00 +0000
18+++ lib/migration/doctrine/1277735444_version5.php 2010-06-29 16:24:30 +0000
19@@ -0,0 +1,21 @@
20+<?php
21+/**
22+ * This class has been auto-generated by the Doctrine ORM Framework
23+ */
24+class Version5 extends Doctrine_Migration_Base
25+{
26+ public function up()
27+ {
28+ $this->changeColumn('extraction_log', 'resources_parsed', 'int', '', array(
29+ 'default' => '0',
30+ ));
31+ $this->changeColumn('extraction_log', 'urls_extracted', 'int', '', array(
32+ 'default' => '0',
33+ ));
34+ }
35+
36+ public function down()
37+ {
38+
39+ }
40+}
41\ No newline at end of file
42
43=== modified file 'lib/task/minerExtractlinksTask.class.php'
44--- lib/task/minerExtractlinksTask.class.php 2010-06-24 15:56:59 +0000
45+++ lib/task/minerExtractlinksTask.class.php 2010-06-29 16:24:30 +0000
46@@ -6,167 +6,168 @@
47 */
48 class minerExtractlinksTask extends sfBaseTask
49 {
50- /**
51- * Configures task.
52- */
53- protected function configure()
54- {
55- $this->addArguments(array(
56- new sfCommandArgument('dsn', sfCommandArgument::REQUIRED),
57- ));
58-
59- // TODO add a --verbose switch
60- $this->addOptions(array(
61- new sfCommandOption('env', null, sfCommandOption::PARAMETER_REQUIRED, 'The environment', 'dev'),
62- new sfCommandOption('connection', null, sfCommandOption::PARAMETER_REQUIRED, 'The connection name', 'doctrine'),
63- new sfCommandOption('extraction-driver', null, sfCommandOption::PARAMETER_REQUIRED, 'Extraction driver class name', 'CI_Extractor_LussumoVanilla1'),
64- new sfCommandOption('incremental', null, sfCommandOption::PARAMETER_REQUIRED, 'If true, only extracts URLs from new and updated resources since last extraction', true),
65- new sfCommandOption('progress', null, sfCommandOption::PARAMETER_NONE, 'Displays a progress bar'),
66- ));
67-
68- $this->namespace = 'miner';
69- $this->name = 'extract-links';
70- $this->briefDescription = 'Extracts links from datasource';
71- $this->detailedDescription = <<<EOF
72+ /**
73+ * Configures task.
74+ */
75+ protected function configure()
76+ {
77+ $this->addArguments(array(
78+ new sfCommandArgument('dsn', sfCommandArgument::REQUIRED),
79+ ));
80+
81+ // TODO add a --verbose switch
82+ $this->addOptions(array(
83+ new sfCommandOption('env', null, sfCommandOption::PARAMETER_REQUIRED, 'The environment', 'dev'),
84+ new sfCommandOption('connection', null, sfCommandOption::PARAMETER_REQUIRED, 'The connection name', 'doctrine'),
85+ new sfCommandOption('extraction-driver', null, sfCommandOption::PARAMETER_REQUIRED, 'Extraction driver class name', 'CI_Extractor_LussumoVanilla1'),
86+ new sfCommandOption('incremental', null, sfCommandOption::PARAMETER_REQUIRED, 'If true, only extracts URLs from new and updated resources since last extraction', true),
87+ new sfCommandOption('progress', null, sfCommandOption::PARAMETER_NONE, 'Displays a progress bar'),
88+ ));
89+
90+ $this->namespace = 'miner';
91+ $this->name = 'extract-links';
92+ $this->briefDescription = 'Extracts links from datasource';
93+ $this->detailedDescription = <<<EOF
94 Call it with:
95
96 [php symfony miner:extract-links --extraction-driver=My_Extraction_Driver|INFO]
97 EOF;
98- }
99-
100- /**
101- * Executes task.
102- *
103- * @param array $arguments
104- * @param array $options
105- */
106- protected function execute($arguments = array(), $options = array())
107- {
108- // Setup logging
109- $this->dispatcher->connect('log', array($this, 'onLog'));
110-
111- // TODO : autoload classes
112- $driver_classname_parts = explode('_', $options['extraction-driver']);
113- require sprintf('%s/vendor/CI/Extractor.php', sfConfig::get('sf_lib_dir'));
114- require sprintf('%s/vendor/CI/Extractor/%s.php', sfConfig::get('sf_lib_dir'), array_pop($driver_classname_parts));
115-
116- // Sanity checks
117- if (!class_exists($options['extraction-driver']))
118- {
119- throw new InvalidArgumentException(sprintf('Class "%s" does not exist', $options['extraction-driver']));
120- }
121-
122- if ($options['incremental'] === 'false')
123- {
124- $options['incremental'] = false;
125- }
126-
127- // Instanciate database connection
128- $databaseManager = new sfDatabaseManager($this->configuration);
129- $connection = $databaseManager->getDatabase($options['connection'])->getConnection();
130-
131- // If extraction is incremental, retrieve last extraction date
132- $since = null;
133- if ($options['incremental'])
134- {
135- $since = $this->getLastExtractionDate($connection);
136- if ($since)
137- {
138- $this->logSection('extract', sprintf('Incrementally extracting URLs from resources created or updated since "%s"', $since));
139- }
140- else
141- {
142- $this->logSection('extract', 'Found no entries in extraction log. Extracting URLs from all resources.');
143- }
144- }
145-
146- // Create new extraction log entry
147- $log_entry = new ExtractionLog();
148- $log_entry->extraction_driver = $options['extraction-driver'];
149- $log_entry->started_on = date('Y-m-d H:i:s');
150- $log_entry->save();
151-
152- // Instanciate and configure extraction engine
153- $extractor = new $options['extraction-driver']($this->dispatcher, $this->configuration, $since);
154-
155- // Extraction statistics
156- $urls_found_count = 0;
157- $resources_parsed = 0;
158- $resources_total = $extractor->countResources($arguments['dsn']);
159-
160- if ($resources_total > 0)
161- {
162- // Instanciate an configure progress bar
163- if ($options['progress'])
164- {
165- include 'Console/ProgressBar.php';
166- $progress_bar = new Console_ProgressBar(
167+ }
168+
169+ /**
170+ * Executes task.
171+ *
172+ * @param array $arguments
173+ * @param array $options
174+ */
175+ protected function execute($arguments = array(), $options = array())
176+ {
177+ // Setup logging
178+ $this->dispatcher->connect('log', array($this, 'onLog'));
179+
180+ // TODO : autoload classes
181+ $driver_classname_parts = explode('_', $options['extraction-driver']);
182+ require sprintf('%s/vendor/CI/Extractor.php', sfConfig::get('sf_lib_dir'));
183+ require sprintf('%s/vendor/CI/Extractor/%s.php', sfConfig::get('sf_lib_dir'), array_pop($driver_classname_parts));
184+
185+ // Sanity checks
186+ if (!class_exists($options['extraction-driver']))
187+ {
188+ throw new InvalidArgumentException(sprintf('Class "%s" does not exist', $options['extraction-driver']));
189+ }
190+
191+ if ($options['incremental'] === 'false')
192+ {
193+ $options['incremental'] = false;
194+ }
195+
196+ // Instanciate database connection
197+ $databaseManager = new sfDatabaseManager($this->configuration);
198+ $connection = $databaseManager->getDatabase($options['connection'])->getConnection();
199+
200+ // If extraction is incremental, retrieve last extraction date
201+ $since = null;
202+ if ($options['incremental'])
203+ {
204+ $since = $this->getLastExtractionDate($connection);
205+ if ($since)
206+ {
207+ $this->logSection('extract', sprintf('Incrementally extracting URLs from resources created or updated since "%s"', $since));
208+ }
209+ else
210+ {
211+ $this->logSection('extract', 'Found no entries in extraction log. Extracting URLs from all resources.');
212+ }
213+ }
214+
215+ // Create new extraction log entry
216+ $log_entry = new ExtractionLog();
217+ $log_entry->extraction_driver = $options['extraction-driver'];
218+ $log_entry->started_on = date('Y-m-d H:i:s');
219+ $log_entry->save();
220+
221+ // Instanciate and configure extraction engine
222+ $extractor = new $options['extraction-driver']($this->dispatcher, $this->configuration, $since);
223+
224+ // Extraction statistics
225+ $urls_found_count = 0;
226+ $resources_total = $extractor->countResources($arguments['dsn'], $since);
227+
228+ if ($resources_total > 0)
229+ {
230+ // Instanciate an configure progress bar
231+ if ($options['progress'])
232+ {
233+ include 'Console/ProgressBar.php';
234+ $progress_bar = new Console_ProgressBar(
235 '** '.$arguments['dsn'].' %fraction% resources [%bar%] %percent% | ',
236 '=>', '-', 80, $resources_total, array('ansi_terminal' => true)
237- );
238- }
239-
240- // Extract resources from source and insert them in Links database
241- while ($resource_extraction_info = $extractor->extract($arguments['dsn'], $options['connection'], $since))
242- {
243- // Update extraction statistics
244- $urls_found_count += $resource_extraction_info['urls_found_count'];
245-
246- // Update extraction log
247- $log_entry->resources_parsed = $resource_extraction_info['resources_parsed_count'];
248- $log_entry->urls_extracted = $urls_found_count;
249+ );
250+ }
251+
252+ // Extract resources from source and insert them in Links database
253+ while ($resource_extraction_info = $extractor->extract($arguments['dsn'], $options['connection'], $since))
254+ {
255+ // Update extraction statistics
256+ $urls_found_count += $resource_extraction_info['urls_found_count'];
257+
258+ // Update extraction log
259+ $log_entry->resources_parsed = $resource_extraction_info['resources_parsed_count'];
260+ $log_entry->urls_extracted = $urls_found_count;
261+ $log_entry->save();
262+
263+ // Update progress bar
264+ if ($options['progress'])
265+ {
266+ $progress_bar->update($resource_extraction_info['resources_parsed_count']);
267+ }
268+ }
269+
270+ // Log
271+ $this->logSection('extract', sprintf('%d URLs were extracted from %d resources', $urls_found_count, $resources_total));
272+ }
273+ else
274+ {
275+ $this->logSection('extract', 'No resources to extract. Exiting.');
276+ }
277+
278+ // Record finish time and statistics
279+ $databaseManager = new sfDatabaseManager($this->configuration);
280+ $connection = $databaseManager->getDatabase($options['connection'])->getConnection();
281+ $log_entry->finished_on = date('Y-m-d H:i:s');
282 $log_entry->save();
283-
284- // Update progress bar
285- if ($options['progress'])
286+ }
287+
288+ /**
289+ * Returns date of most recent extraction.
290+ *
291+ * @param string $doctrine_connection
292+ */
293+ private function getLastExtractionDate()
294+ {
295+ // Retrieve last extraction date
296+ $last_extraction_date = Doctrine_Query::create()
297+ ->select('l.started_on')
298+ ->from('ExtractionLog l')
299+ ->orderBy('l.started_on desc')
300+ ->limit(1)
301+ ->execute(null, Doctrine_Core::HYDRATE_SINGLE_SCALAR);
302+
303+ if (!$last_extraction_date)
304 {
305- $progress_bar->update($resource_extraction_info['resources_parsed_count']);
306+ $last_extraction_date = null;
307 }
308- }
309-
310- // Log
311- $this->logSection('extract', sprintf('%d URLs where extracted from %d resources', $urls_found_count, $resources_total));
312- }
313- else
314- {
315- $this->logSection('extract', 'No resources to extract. Exiting.');
316- }
317-
318- // Record finish time
319- $log_entry->finished_on = date('Y-m-d H:i:s');
320- $log_entry->save();
321- }
322-
323- /**
324- * Returns date of most recent extraction.
325- *
326- * @param string $doctrine_connection
327- */
328- private function getLastExtractionDate()
329- {
330- // Retrieve last extraction date
331- $last_extraction_date = Doctrine_Query::create()
332- ->select('l.started_on')
333- ->from('ExtractionLog l')
334- ->orderBy('l.started_on desc')
335- ->limit(1)
336- ->execute(null, Doctrine_Core::HYDRATE_SINGLE_SCALAR);
337-
338- if (!$last_extraction_date)
339- {
340- $last_extraction_date = null;
341- }
342-
343- return $last_extraction_date;
344- }
345-
346- /**
347- * Listens for "log" events and logs messages to stdout.
348- *
349- * @param sfEvent $event
350- */
351- public function onLog(sfEvent $event)
352- {
353- $this->logSection('extract', $event['message']);
354- }
355+
356+ return $last_extraction_date;
357+ }
358+
359+ /**
360+ * Listens for "log" events and logs messages to stdout.
361+ *
362+ * @param sfEvent $event
363+ */
364+ public function onLog(sfEvent $event)
365+ {
366+ $this->logSection('extract', $event['message']);
367+ }
368 }
369
370=== modified file 'lib/vendor/CI/Extractor.php'
371--- lib/vendor/CI/Extractor.php 2010-06-24 15:56:59 +0000
372+++ lib/vendor/CI/Extractor.php 2010-06-29 16:24:30 +0000
373@@ -117,7 +117,7 @@
374 *
375 * @param string $dsn_source
376 * @param string $connection_dest
377- * @param string $since Extract urls from resources updated or create since this date (Y-m-d H:i:s)
378+ * @param string $since Extract urls from resources updated or created since this date (Y-m-d H:i:s)
379 */
380 public function extract($dsn_source, $connection_dest, $since = null)
381 {
382
383=== modified file 'lib/vendor/CI/Extractor/LussumoVanilla1.php'
384--- lib/vendor/CI/Extractor/LussumoVanilla1.php 2010-06-24 15:56:59 +0000
385+++ lib/vendor/CI/Extractor/LussumoVanilla1.php 2010-06-29 16:24:30 +0000
386@@ -84,7 +84,7 @@
387 $q = 'select count(c.CommentID) from LUM_Comment c where c.Deleted != 1 and c.WhisperUserID = 0';
388 if ($since)
389 {
390- $q .= sprintf(' where c.DateCreated > "%s"', $since);
391+ $q .= sprintf(' and c.DateCreated > "%s"', $since);
392 }
393
394 return (int)$this->getConnection($dsn)->fetchOne($q);

Subscribers

People subscribed via source and target branches

to all changes: