Merge lp:~tristan-rivoallan/vanilla-miner/vm-599375 into lp:vanilla-miner
- vm-599375
- Merge into trunk
Proposed by
Tristan Rivoallan
Status: | Merged |
---|---|
Merged at revision: | 56 |
Proposed branch: | lp:~tristan-rivoallan/vanilla-miner/vm-599375 |
Merge into: | lp:vanilla-miner |
Diff against target: |
394 lines (+181/-157) 5 files modified
config/doctrine/schema.yml (+3/-1), lib/migration/doctrine/1277735444_version5.php (+21/-0), lib/task/minerExtractlinksTask.class.php (+155/-154), lib/vendor/CI/Extractor.php (+1/-1), lib/vendor/CI/Extractor/LussumoVanilla1.php (+1/-1) |
To merge this branch: | bzr merge lp:~tristan-rivoallan/vanilla-miner/vm-599375 |
Related bugs: |
Reviewer | Review Type | Date Requested | Status |
---|---|---|---|
Tristan Rivoallan | Approve | ||
Review via email:
|
Commit message
Description of the change
To post a comment you must log in.
Revision history for this message
![](/+icing/build/overlay/assets/skins/sam/images/close.gif)
Tristan Rivoallan (tristan-rivoallan) : | # |
review:
Approve
Preview Diff
[H/L] Next/Prev Comment, [J/K] Next/Prev File, [N/P] Next/Prev Hunk
1 | === modified file 'config/doctrine/schema.yml' |
2 | --- config/doctrine/schema.yml 2010-06-24 15:56:59 +0000 |
3 | +++ config/doctrine/schema.yml 2010-06-29 16:24:30 +0000 |
4 | @@ -67,5 +67,7 @@ |
5 | type: timestamp |
6 | resources_parsed: |
7 | type: int |
8 | + default: 0 |
9 | urls_extracted: |
10 | - type: int |
11 | \ No newline at end of file |
12 | + type: int |
13 | + default: 0 |
14 | \ No newline at end of file |
15 | |
16 | === added file 'lib/migration/doctrine/1277735444_version5.php' |
17 | --- lib/migration/doctrine/1277735444_version5.php 1970-01-01 00:00:00 +0000 |
18 | +++ lib/migration/doctrine/1277735444_version5.php 2010-06-29 16:24:30 +0000 |
19 | @@ -0,0 +1,21 @@ |
20 | +<?php |
21 | +/** |
22 | + * This class has been auto-generated by the Doctrine ORM Framework |
23 | + */ |
24 | +class Version5 extends Doctrine_Migration_Base |
25 | +{ |
26 | + public function up() |
27 | + { |
28 | + $this->changeColumn('extraction_log', 'resources_parsed', 'int', '', array( |
29 | + 'default' => '0', |
30 | + )); |
31 | + $this->changeColumn('extraction_log', 'urls_extracted', 'int', '', array( |
32 | + 'default' => '0', |
33 | + )); |
34 | + } |
35 | + |
36 | + public function down() |
37 | + { |
38 | + |
39 | + } |
40 | +} |
41 | \ No newline at end of file |
42 | |
43 | === modified file 'lib/task/minerExtractlinksTask.class.php' |
44 | --- lib/task/minerExtractlinksTask.class.php 2010-06-24 15:56:59 +0000 |
45 | +++ lib/task/minerExtractlinksTask.class.php 2010-06-29 16:24:30 +0000 |
46 | @@ -6,167 +6,168 @@ |
47 | */ |
48 | class minerExtractlinksTask extends sfBaseTask |
49 | { |
50 | - /** |
51 | - * Configures task. |
52 | - */ |
53 | - protected function configure() |
54 | - { |
55 | - $this->addArguments(array( |
56 | - new sfCommandArgument('dsn', sfCommandArgument::REQUIRED), |
57 | - )); |
58 | - |
59 | - // TODO add a --verbose switch |
60 | - $this->addOptions(array( |
61 | - new sfCommandOption('env', null, sfCommandOption::PARAMETER_REQUIRED, 'The environment', 'dev'), |
62 | - new sfCommandOption('connection', null, sfCommandOption::PARAMETER_REQUIRED, 'The connection name', 'doctrine'), |
63 | - new sfCommandOption('extraction-driver', null, sfCommandOption::PARAMETER_REQUIRED, 'Extraction driver class name', 'CI_Extractor_LussumoVanilla1'), |
64 | - new sfCommandOption('incremental', null, sfCommandOption::PARAMETER_REQUIRED, 'If true, only extracts URLs from new and updated resources since last extraction', true), |
65 | - new sfCommandOption('progress', null, sfCommandOption::PARAMETER_NONE, 'Displays a progress bar'), |
66 | - )); |
67 | - |
68 | - $this->namespace = 'miner'; |
69 | - $this->name = 'extract-links'; |
70 | - $this->briefDescription = 'Extracts links from datasource'; |
71 | - $this->detailedDescription = <<<EOF |
72 | + /** |
73 | + * Configures task. |
74 | + */ |
75 | + protected function configure() |
76 | + { |
77 | + $this->addArguments(array( |
78 | + new sfCommandArgument('dsn', sfCommandArgument::REQUIRED), |
79 | + )); |
80 | + |
81 | + // TODO add a --verbose switch |
82 | + $this->addOptions(array( |
83 | + new sfCommandOption('env', null, sfCommandOption::PARAMETER_REQUIRED, 'The environment', 'dev'), |
84 | + new sfCommandOption('connection', null, sfCommandOption::PARAMETER_REQUIRED, 'The connection name', 'doctrine'), |
85 | + new sfCommandOption('extraction-driver', null, sfCommandOption::PARAMETER_REQUIRED, 'Extraction driver class name', 'CI_Extractor_LussumoVanilla1'), |
86 | + new sfCommandOption('incremental', null, sfCommandOption::PARAMETER_REQUIRED, 'If true, only extracts URLs from new and updated resources since last extraction', true), |
87 | + new sfCommandOption('progress', null, sfCommandOption::PARAMETER_NONE, 'Displays a progress bar'), |
88 | + )); |
89 | + |
90 | + $this->namespace = 'miner'; |
91 | + $this->name = 'extract-links'; |
92 | + $this->briefDescription = 'Extracts links from datasource'; |
93 | + $this->detailedDescription = <<<EOF |
94 | Call it with: |
95 | |
96 | [php symfony miner:extract-links --extraction-driver=My_Extraction_Driver|INFO] |
97 | EOF; |
98 | - } |
99 | - |
100 | - /** |
101 | - * Executes task. |
102 | - * |
103 | - * @param array $arguments |
104 | - * @param array $options |
105 | - */ |
106 | - protected function execute($arguments = array(), $options = array()) |
107 | - { |
108 | - // Setup logging |
109 | - $this->dispatcher->connect('log', array($this, 'onLog')); |
110 | - |
111 | - // TODO : autoload classes |
112 | - $driver_classname_parts = explode('_', $options['extraction-driver']); |
113 | - require sprintf('%s/vendor/CI/Extractor.php', sfConfig::get('sf_lib_dir')); |
114 | - require sprintf('%s/vendor/CI/Extractor/%s.php', sfConfig::get('sf_lib_dir'), array_pop($driver_classname_parts)); |
115 | - |
116 | - // Sanity checks |
117 | - if (!class_exists($options['extraction-driver'])) |
118 | - { |
119 | - throw new InvalidArgumentException(sprintf('Class "%s" does not exist', $options['extraction-driver'])); |
120 | - } |
121 | - |
122 | - if ($options['incremental'] === 'false') |
123 | - { |
124 | - $options['incremental'] = false; |
125 | - } |
126 | - |
127 | - // Instanciate database connection |
128 | - $databaseManager = new sfDatabaseManager($this->configuration); |
129 | - $connection = $databaseManager->getDatabase($options['connection'])->getConnection(); |
130 | - |
131 | - // If extraction is incremental, retrieve last extraction date |
132 | - $since = null; |
133 | - if ($options['incremental']) |
134 | - { |
135 | - $since = $this->getLastExtractionDate($connection); |
136 | - if ($since) |
137 | - { |
138 | - $this->logSection('extract', sprintf('Incrementally extracting URLs from resources created or updated since "%s"', $since)); |
139 | - } |
140 | - else |
141 | - { |
142 | - $this->logSection('extract', 'Found no entries in extraction log. Extracting URLs from all resources.'); |
143 | - } |
144 | - } |
145 | - |
146 | - // Create new extraction log entry |
147 | - $log_entry = new ExtractionLog(); |
148 | - $log_entry->extraction_driver = $options['extraction-driver']; |
149 | - $log_entry->started_on = date('Y-m-d H:i:s'); |
150 | - $log_entry->save(); |
151 | - |
152 | - // Instanciate and configure extraction engine |
153 | - $extractor = new $options['extraction-driver']($this->dispatcher, $this->configuration, $since); |
154 | - |
155 | - // Extraction statistics |
156 | - $urls_found_count = 0; |
157 | - $resources_parsed = 0; |
158 | - $resources_total = $extractor->countResources($arguments['dsn']); |
159 | - |
160 | - if ($resources_total > 0) |
161 | - { |
162 | - // Instanciate an configure progress bar |
163 | - if ($options['progress']) |
164 | - { |
165 | - include 'Console/ProgressBar.php'; |
166 | - $progress_bar = new Console_ProgressBar( |
167 | + } |
168 | + |
169 | + /** |
170 | + * Executes task. |
171 | + * |
172 | + * @param array $arguments |
173 | + * @param array $options |
174 | + */ |
175 | + protected function execute($arguments = array(), $options = array()) |
176 | + { |
177 | + // Setup logging |
178 | + $this->dispatcher->connect('log', array($this, 'onLog')); |
179 | + |
180 | + // TODO : autoload classes |
181 | + $driver_classname_parts = explode('_', $options['extraction-driver']); |
182 | + require sprintf('%s/vendor/CI/Extractor.php', sfConfig::get('sf_lib_dir')); |
183 | + require sprintf('%s/vendor/CI/Extractor/%s.php', sfConfig::get('sf_lib_dir'), array_pop($driver_classname_parts)); |
184 | + |
185 | + // Sanity checks |
186 | + if (!class_exists($options['extraction-driver'])) |
187 | + { |
188 | + throw new InvalidArgumentException(sprintf('Class "%s" does not exist', $options['extraction-driver'])); |
189 | + } |
190 | + |
191 | + if ($options['incremental'] === 'false') |
192 | + { |
193 | + $options['incremental'] = false; |
194 | + } |
195 | + |
196 | + // Instanciate database connection |
197 | + $databaseManager = new sfDatabaseManager($this->configuration); |
198 | + $connection = $databaseManager->getDatabase($options['connection'])->getConnection(); |
199 | + |
200 | + // If extraction is incremental, retrieve last extraction date |
201 | + $since = null; |
202 | + if ($options['incremental']) |
203 | + { |
204 | + $since = $this->getLastExtractionDate($connection); |
205 | + if ($since) |
206 | + { |
207 | + $this->logSection('extract', sprintf('Incrementally extracting URLs from resources created or updated since "%s"', $since)); |
208 | + } |
209 | + else |
210 | + { |
211 | + $this->logSection('extract', 'Found no entries in extraction log. Extracting URLs from all resources.'); |
212 | + } |
213 | + } |
214 | + |
215 | + // Create new extraction log entry |
216 | + $log_entry = new ExtractionLog(); |
217 | + $log_entry->extraction_driver = $options['extraction-driver']; |
218 | + $log_entry->started_on = date('Y-m-d H:i:s'); |
219 | + $log_entry->save(); |
220 | + |
221 | + // Instanciate and configure extraction engine |
222 | + $extractor = new $options['extraction-driver']($this->dispatcher, $this->configuration, $since); |
223 | + |
224 | + // Extraction statistics |
225 | + $urls_found_count = 0; |
226 | + $resources_total = $extractor->countResources($arguments['dsn'], $since); |
227 | + |
228 | + if ($resources_total > 0) |
229 | + { |
230 | + // Instanciate an configure progress bar |
231 | + if ($options['progress']) |
232 | + { |
233 | + include 'Console/ProgressBar.php'; |
234 | + $progress_bar = new Console_ProgressBar( |
235 | '** '.$arguments['dsn'].' %fraction% resources [%bar%] %percent% | ', |
236 | '=>', '-', 80, $resources_total, array('ansi_terminal' => true) |
237 | - ); |
238 | - } |
239 | - |
240 | - // Extract resources from source and insert them in Links database |
241 | - while ($resource_extraction_info = $extractor->extract($arguments['dsn'], $options['connection'], $since)) |
242 | - { |
243 | - // Update extraction statistics |
244 | - $urls_found_count += $resource_extraction_info['urls_found_count']; |
245 | - |
246 | - // Update extraction log |
247 | - $log_entry->resources_parsed = $resource_extraction_info['resources_parsed_count']; |
248 | - $log_entry->urls_extracted = $urls_found_count; |
249 | + ); |
250 | + } |
251 | + |
252 | + // Extract resources from source and insert them in Links database |
253 | + while ($resource_extraction_info = $extractor->extract($arguments['dsn'], $options['connection'], $since)) |
254 | + { |
255 | + // Update extraction statistics |
256 | + $urls_found_count += $resource_extraction_info['urls_found_count']; |
257 | + |
258 | + // Update extraction log |
259 | + $log_entry->resources_parsed = $resource_extraction_info['resources_parsed_count']; |
260 | + $log_entry->urls_extracted = $urls_found_count; |
261 | + $log_entry->save(); |
262 | + |
263 | + // Update progress bar |
264 | + if ($options['progress']) |
265 | + { |
266 | + $progress_bar->update($resource_extraction_info['resources_parsed_count']); |
267 | + } |
268 | + } |
269 | + |
270 | + // Log |
271 | + $this->logSection('extract', sprintf('%d URLs were extracted from %d resources', $urls_found_count, $resources_total)); |
272 | + } |
273 | + else |
274 | + { |
275 | + $this->logSection('extract', 'No resources to extract. Exiting.'); |
276 | + } |
277 | + |
278 | + // Record finish time and statistics |
279 | + $databaseManager = new sfDatabaseManager($this->configuration); |
280 | + $connection = $databaseManager->getDatabase($options['connection'])->getConnection(); |
281 | + $log_entry->finished_on = date('Y-m-d H:i:s'); |
282 | $log_entry->save(); |
283 | - |
284 | - // Update progress bar |
285 | - if ($options['progress']) |
286 | + } |
287 | + |
288 | + /** |
289 | + * Returns date of most recent extraction. |
290 | + * |
291 | + * @param string $doctrine_connection |
292 | + */ |
293 | + private function getLastExtractionDate() |
294 | + { |
295 | + // Retrieve last extraction date |
296 | + $last_extraction_date = Doctrine_Query::create() |
297 | + ->select('l.started_on') |
298 | + ->from('ExtractionLog l') |
299 | + ->orderBy('l.started_on desc') |
300 | + ->limit(1) |
301 | + ->execute(null, Doctrine_Core::HYDRATE_SINGLE_SCALAR); |
302 | + |
303 | + if (!$last_extraction_date) |
304 | { |
305 | - $progress_bar->update($resource_extraction_info['resources_parsed_count']); |
306 | + $last_extraction_date = null; |
307 | } |
308 | - } |
309 | - |
310 | - // Log |
311 | - $this->logSection('extract', sprintf('%d URLs where extracted from %d resources', $urls_found_count, $resources_total)); |
312 | - } |
313 | - else |
314 | - { |
315 | - $this->logSection('extract', 'No resources to extract. Exiting.'); |
316 | - } |
317 | - |
318 | - // Record finish time |
319 | - $log_entry->finished_on = date('Y-m-d H:i:s'); |
320 | - $log_entry->save(); |
321 | - } |
322 | - |
323 | - /** |
324 | - * Returns date of most recent extraction. |
325 | - * |
326 | - * @param string $doctrine_connection |
327 | - */ |
328 | - private function getLastExtractionDate() |
329 | - { |
330 | - // Retrieve last extraction date |
331 | - $last_extraction_date = Doctrine_Query::create() |
332 | - ->select('l.started_on') |
333 | - ->from('ExtractionLog l') |
334 | - ->orderBy('l.started_on desc') |
335 | - ->limit(1) |
336 | - ->execute(null, Doctrine_Core::HYDRATE_SINGLE_SCALAR); |
337 | - |
338 | - if (!$last_extraction_date) |
339 | - { |
340 | - $last_extraction_date = null; |
341 | - } |
342 | - |
343 | - return $last_extraction_date; |
344 | - } |
345 | - |
346 | - /** |
347 | - * Listens for "log" events and logs messages to stdout. |
348 | - * |
349 | - * @param sfEvent $event |
350 | - */ |
351 | - public function onLog(sfEvent $event) |
352 | - { |
353 | - $this->logSection('extract', $event['message']); |
354 | - } |
355 | + |
356 | + return $last_extraction_date; |
357 | + } |
358 | + |
359 | + /** |
360 | + * Listens for "log" events and logs messages to stdout. |
361 | + * |
362 | + * @param sfEvent $event |
363 | + */ |
364 | + public function onLog(sfEvent $event) |
365 | + { |
366 | + $this->logSection('extract', $event['message']); |
367 | + } |
368 | } |
369 | |
370 | === modified file 'lib/vendor/CI/Extractor.php' |
371 | --- lib/vendor/CI/Extractor.php 2010-06-24 15:56:59 +0000 |
372 | +++ lib/vendor/CI/Extractor.php 2010-06-29 16:24:30 +0000 |
373 | @@ -117,7 +117,7 @@ |
374 | * |
375 | * @param string $dsn_source |
376 | * @param string $connection_dest |
377 | - * @param string $since Extract urls from resources updated or create since this date (Y-m-d H:i:s) |
378 | + * @param string $since Extract urls from resources updated or created since this date (Y-m-d H:i:s) |
379 | */ |
380 | public function extract($dsn_source, $connection_dest, $since = null) |
381 | { |
382 | |
383 | === modified file 'lib/vendor/CI/Extractor/LussumoVanilla1.php' |
384 | --- lib/vendor/CI/Extractor/LussumoVanilla1.php 2010-06-24 15:56:59 +0000 |
385 | +++ lib/vendor/CI/Extractor/LussumoVanilla1.php 2010-06-29 16:24:30 +0000 |
386 | @@ -84,7 +84,7 @@ |
387 | $q = 'select count(c.CommentID) from LUM_Comment c where c.Deleted != 1 and c.WhisperUserID = 0'; |
388 | if ($since) |
389 | { |
390 | - $q .= sprintf(' where c.DateCreated > "%s"', $since); |
391 | + $q .= sprintf(' and c.DateCreated > "%s"', $since); |
392 | } |
393 | |
394 | return (int)$this->getConnection($dsn)->fetchOne($q); |