Merge lp:~tristan-rivoallan/vanilla-miner/vm-599285 into lp:vanilla-miner
- vm-599285
- Merge into trunk
Proposed by
Tristan Rivoallan
| Field | Value |
|---|---|
| Status | Merged |
| Merged at revision | 57 |
| Proposed branch | lp:~tristan-rivoallan/vanilla-miner/vm-599285 |
| Merge into | lp:vanilla-miner |
| Diff against target | 385 lines (+202/-167), 1 file modified: lib/task/minerExpandLinksTask.class.php (+202/-167) |
| To merge this branch | bzr merge lp:~tristan-rivoallan/vanilla-miner/vm-599285 |
| Related bugs | |

| Reviewer | Review Type | Date Requested | Status |
|---|---|---|---|
| Tristan Rivoallan | | | Approve |

Review via email: mp+28771@code.launchpad.net
Commit message
Description of the change
To post a comment you must log in.
Revision history for this message
Tristan Rivoallan (tristan-rivoallan) — review: Approve
Preview Diff
[H/L] Next/Prev Comment, [J/K] Next/Prev File, [N/P] Next/Prev Hunk
1 | === modified file 'lib/task/minerExpandLinksTask.class.php' |
2 | --- lib/task/minerExpandLinksTask.class.php 2010-06-23 20:45:16 +0000 |
3 | +++ lib/task/minerExpandLinksTask.class.php 2010-06-29 16:31:33 +0000 |
4 | @@ -1,178 +1,213 @@ |
5 | <?php |
6 | - |
7 | +/** |
8 | + * Crawls links URLs in order to gather meaningful informations (availability, etc). |
9 | + */ |
10 | class minerExpandLinksTask extends sfBaseTask |
11 | { |
12 | - protected function configure() |
13 | - { |
14 | - $this->addOptions(array( |
15 | - new sfCommandOption('env', null, sfCommandOption::PARAMETER_REQUIRED, 'The environment', 'dev'), |
16 | - new sfCommandOption('connection', null, sfCommandOption::PARAMETER_REQUIRED, 'The connection name', 'doctrine'), |
17 | - new sfCommandOption('progress', null, sfCommandOption::PARAMETER_NONE, 'Display a progress bar'), |
18 | - new sfCommandOption('verbose', null, sfCommandOption::PARAMETER_NONE, 'Display more informations about extraction process'), |
19 | - new sfCommandOption('all', null, sfCommandOption::PARAMETER_NONE, 'Expand all links in database. By default, only new links are expanded'), |
20 | - new sfCommandOption('with-unavailable', null, sfCommandOption::PARAMETER_NONE, 'When expanding all links (--all), also include links previously marked as unavailable'), |
21 | - // TODO : add --older-than option |
22 | - )); |
23 | + /** |
24 | + * Configures task. |
25 | + * |
26 | + * (non-PHPdoc) |
27 | + * @see vendor/symfony/lib/task/sfTask::configure() |
28 | + */ |
29 | + protected function configure() |
30 | + { |
31 | + $this->addOptions(array( |
32 | + new sfCommandOption('env', null, sfCommandOption::PARAMETER_REQUIRED, 'The environment', 'dev'), |
33 | + new sfCommandOption('connection', null, sfCommandOption::PARAMETER_REQUIRED, 'The connection name', 'doctrine'), |
34 | + new sfCommandOption('progress', null, sfCommandOption::PARAMETER_NONE, 'Display a progress bar'), |
35 | + new sfCommandOption('verbose', null, sfCommandOption::PARAMETER_NONE, 'Display more informations about extraction process'), |
36 | + new sfCommandOption('all', null, sfCommandOption::PARAMETER_NONE, 'Expand all links in database. By default, only new links are expanded'), |
37 | + new sfCommandOption('with-unavailable', null, sfCommandOption::PARAMETER_NONE, 'When expanding all links (--all), also include links previously marked as unavailable'), |
38 | + // TODO : add --older-than option |
39 | + )); |
40 | |
41 | - $this->namespace = 'miner'; |
42 | - $this->name = 'expand-links'; |
43 | - // TODO : write descriptions |
44 | - $this->briefDescription = 'Expands informations about links by crawling their URLs'; |
45 | - $this->detailedDescription = <<<EOF |
46 | + $this->namespace = 'miner'; |
47 | + $this->name = 'expand-links'; |
48 | + $this->briefDescription = 'Expands informations about links by crawling their URLs'; |
49 | + $this->detailedDescription = <<<EOF |
50 | |
51 | Use cases : |
52 | * Expand new urls : [php symfony miner:expand-links|INFO] |
53 | * Expand all urls (a word about --with-unavailable) : [php symfony miner:expand-links --all|INFO] |
54 | * Expand all urls, including those previously marked as unavailable : [php symfony miner:expand-links --all --with-unavailable|INFO] |
55 | EOF; |
56 | - } |
57 | - |
58 | - protected function execute($arguments = array(), $options = array()) |
59 | - { |
60 | - // Open database connection |
61 | - $databaseManager = new sfDatabaseManager($this->configuration); |
62 | - $connection = $databaseManager->getDatabase($options['connection'])->getConnection(); |
63 | - |
64 | - // Build query for fetching links from database |
65 | - $q = Doctrine_Query::create() |
66 | - ->select('l.url') |
67 | - ->from('Link l'); |
68 | - if (!$options['all']) |
69 | - { |
70 | - $q->where('l.expanded_at is null'); |
71 | - } |
72 | - if (!$options['with-unavailable']) |
73 | - { |
74 | - $q->andWhere('l.availability != "unavailable"'); |
75 | - } |
76 | - |
77 | - // Fetch links from database |
78 | - $links_count = $q->count(); |
79 | - $links = $q->execute(null, Doctrine_Core::HYDRATE_ON_DEMAND); |
80 | - $q->free(); |
81 | - $this->logSection('info', sprintf('Expanding %s links', $links_count)); |
82 | - |
83 | - // Instanciate progress bar, if user requested so |
84 | - $links_expanded = 0; |
85 | - if ($options['progress']) |
86 | - { |
87 | - include 'Console/ProgressBar.php'; |
88 | - $progress_bar = new Console_ProgressBar( |
89 | - '** Links %fraction% comments [%bar%] %percent% | ', |
90 | - '=>', '-', 80, $links_count, array('ansi_terminal' => true) |
91 | - ); |
92 | - $progress_bar->update($links_expanded); |
93 | - } |
94 | - |
95 | - // Launch a HEAD request on each link, and use data in response headers to update informations about link in database |
96 | - // TODO : move crawling code to dedicated class. and then create miner:crawl-url task |
97 | - require 'HTTP/Request2.php'; |
98 | - $request = new HTTP_Request2(null, HTTP_Request2::METHOD_HEAD, array('follow_redirects' => true)); |
99 | - $request->setHeader('user-agent', 'vanilla-miner/1.1 (https://launchpad.net/vanilla-miner)'); |
100 | - |
101 | - foreach ($links as $link) |
102 | - { |
103 | - $link->expanded_at = time(); |
104 | - try |
105 | - { |
106 | - $request->setUrl($link->url); |
107 | - $response = $request->send(); |
108 | - if (200 == $response->getStatus()) |
109 | - { |
110 | - if ($options['progress']) |
111 | - { |
112 | - $this->log(sprintf('[%d] %s', $response->getStatus(), $link->url)); |
113 | - } |
114 | - else |
115 | - { |
116 | - $this->logSection('info', sprintf('[%d] %s - Updating metadata, marking as available', $response->getStatus(), $link->url)); |
117 | - } |
118 | - |
119 | - // Extract meaningful informations from server response |
120 | - $header = $response->getHeader(); |
121 | - $header = $this->normalizeHeader($header); |
122 | - $link->mime_type = $this->getMimeType($header); |
123 | - |
124 | - // Mark link as available |
125 | - $link->availability = 'available'; |
126 | - |
127 | - // Save link to database |
128 | - $link->replace(); |
129 | - } |
130 | - else |
131 | - { |
132 | - if ($options['progress']) |
133 | - { |
134 | - $this->log(sprintf('[%d] %s', $response->getStatus(), $link->url)); |
135 | - } |
136 | - else |
137 | - { |
138 | - $this->logSection('notice', sprintf( |
139 | - '[%d] %s (%d %s) - Marking as unavailable', |
140 | - $response->getStatus(), |
141 | - $link->url, |
142 | - $response->getStatus(), |
143 | - $response->getReasonPhrase() |
144 | - ) |
145 | - ); |
146 | - } |
147 | - $link->availability = 'unavailable'; |
148 | - $link->replace(); |
149 | - } |
150 | - } |
151 | - catch (HTTP_Request2_Exception $e) |
152 | - { |
153 | - if ($options['progress']) |
154 | - { |
155 | - $this->log(sprintf('[ERR] %s', $link->url)); |
156 | - } |
157 | - else |
158 | - { |
159 | - $this->logSection('error', sprintf('[ERR] Received exception with message "%s" for link "%s" - Marking as unavailable.', $e->getMessage(), $link->url)); |
160 | - } |
161 | - $link->availability = 'unavailable'; |
162 | - $link->replace(); |
163 | - } |
164 | - |
165 | - // Update progress bar |
166 | - if ($options['progress']) |
167 | - { |
168 | - $progress_bar->update(++$links_expanded); |
169 | - } |
170 | - |
171 | - } |
172 | - } |
173 | - |
174 | - private function normalizeHeader(array $header) |
175 | - { |
176 | - // Make all header names lower case |
177 | - $header_rev = array_flip($header); |
178 | - array_walk($header_rev, create_function('&$item, $key', 'strtolower($item);')); |
179 | - $header = array_flip($header_rev); |
180 | - |
181 | - return $header; |
182 | - } |
183 | - |
184 | - private function getMimeType(array $header) |
185 | - { |
186 | - $mime_type = null; |
187 | - |
188 | - if (isset($header['content-type'])) |
189 | - { |
190 | - $mime_type = $header['content-type']; |
191 | - |
192 | - // Extract mime type from content-type header |
193 | - // TODO : use a regular expression instead of this crappy flow |
194 | - $matches = array(); |
195 | - if (strpos($header['content-type'], 'charset') !== false) |
196 | - { |
197 | - if (preg_match('/(.+); ?charset=.+/i', $header['content-type'], $matches)) |
198 | - { |
199 | - $mime_type = $matches[1]; |
200 | - } |
201 | - } |
202 | - } |
203 | - |
204 | - return $mime_type; |
205 | - } |
206 | + } |
207 | + |
208 | + /** |
209 | + * Executes task. |
210 | + * |
211 | + * (non-PHPdoc) |
212 | + * @see vendor/symfony/lib/task/sfTask::execute() |
213 | + */ |
214 | + protected function execute($arguments = array(), $options = array()) |
215 | + { |
216 | + // Open database connection |
217 | + $databaseManager = new sfDatabaseManager($this->configuration); |
218 | + $connection = $databaseManager->getDatabase($options['connection'])->getConnection(); |
219 | + |
220 | + // Build query for fetching links from database |
221 | + $q = Doctrine_Query::create() |
222 | + ->select('l.url') |
223 | + ->from('Link l'); |
224 | + if (!$options['all']) |
225 | + { |
226 | + $q->where('l.expanded_at is null'); |
227 | + } |
228 | + if (!$options['with-unavailable']) |
229 | + { |
230 | + $q->andWhere('l.availability != "unavailable"'); |
231 | + } |
232 | + |
233 | + // Fetch links from database |
234 | + $links_count = $q->count(); |
235 | + if ($links_count > 0) |
236 | + { |
237 | + $links = $q->execute(null, Doctrine_Core::HYDRATE_ON_DEMAND); |
238 | + $q->free(); |
239 | + $this->logSection('expand', sprintf('Expanding %s links', $links_count)); |
240 | + |
241 | + // Instanciate progress bar, if user requested so |
242 | + $links_expanded = 0; |
243 | + if ($options['progress']) |
244 | + { |
245 | + include 'Console/ProgressBar.php'; |
246 | + $progress_bar = new Console_ProgressBar( |
247 | + '** Links %fraction% comments [%bar%] %percent% | ', |
248 | + '=>', '-', 80, $links_count, array('ansi_terminal' => true) |
249 | + ); |
250 | + $progress_bar->update($links_expanded); |
251 | + } |
252 | + |
253 | + // Launch a HEAD request on each link, and use data in response headers to update informations about link in database |
254 | + // TODO : move crawling code to dedicated class. and then create miner:crawl-url task |
255 | + require 'HTTP/Request2.php'; |
256 | + $request = new HTTP_Request2(null, HTTP_Request2::METHOD_HEAD, array('follow_redirects' => true)); |
257 | + $request->setHeader('user-agent', 'vanilla-miner/1.1 (https://launchpad.net/vanilla-miner)'); |
258 | + |
259 | + foreach ($links as $link) |
260 | + { |
261 | + $link->expanded_at = time(); |
262 | + try |
263 | + { |
264 | + $request->setUrl($link->url); |
265 | + $response = $request->send(); |
266 | + if (200 == $response->getStatus()) |
267 | + { |
268 | + if ($options['progress']) |
269 | + { |
270 | + $this->log(sprintf('[%d] %s', $response->getStatus(), $link->url)); |
271 | + } |
272 | + else |
273 | + { |
274 | + $this->logSection('expand', sprintf('[%d] %s - Updating metadata, marking as available', $response->getStatus(), $link->url)); |
275 | + } |
276 | + |
277 | + // Extract meaningful informations from server response |
278 | + $header = $response->getHeader(); |
279 | + $header = $this->normalizeHeader($header); |
280 | + $link->mime_type = $this->getMimeType($header); |
281 | + |
282 | + // Mark link as available |
283 | + $link->availability = 'available'; |
284 | + |
285 | + // Save link to database |
286 | + $link->replace(); |
287 | + } |
288 | + else |
289 | + { |
290 | + if ($options['progress']) |
291 | + { |
292 | + $this->log(sprintf('[%d] %s', $response->getStatus(), $link->url)); |
293 | + } |
294 | + else |
295 | + { |
296 | + $this->logSection('expand', sprintf( |
297 | + '[%d] %s (%d %s) - Marking as unavailable', |
298 | + $response->getStatus(), |
299 | + $link->url, |
300 | + $response->getStatus(), |
301 | + $response->getReasonPhrase() |
302 | + ), |
303 | + null, |
304 | + 'ERROR' |
305 | + ); |
306 | + } |
307 | + $link->availability = 'unavailable'; |
308 | + $link->replace(); |
309 | + } |
310 | + } |
311 | + catch (HTTP_Request2_Exception $e) |
312 | + { |
313 | + if ($options['progress']) |
314 | + { |
315 | + $this->log(sprintf('[ERR] %s', $link->url)); |
316 | + } |
317 | + else |
318 | + { |
319 | + $this->logSection('expand', sprintf('[ERR] Received exception with message "%s" for link "%s" - Marking as unavailable.', $e->getMessage(), $link->url), null, 'ERROR'); |
320 | + } |
321 | + $link->availability = 'unavailable'; |
322 | + $link->replace(); |
323 | + } |
324 | + |
325 | + // Update progress bar |
326 | + if ($options['progress']) |
327 | + { |
328 | + $progress_bar->update(++$links_expanded); |
329 | + } |
330 | + } |
331 | + } |
332 | + else |
333 | + { |
334 | + $this->logSection('expand', 'No links to expand. Exiting.'); |
335 | + } |
336 | + } |
337 | + |
338 | + /** |
339 | + * Lowercases all header names. |
340 | + * |
341 | + * @param array $header |
342 | + * |
343 | + * @return array |
344 | + */ |
345 | + private function normalizeHeader(array $header) |
346 | + { |
347 | + // Make all header names lower case |
348 | + $header_rev = array_flip($header); |
349 | + array_walk($header_rev, create_function('&$item, $key', 'strtolower($item);')); |
350 | + $header = array_flip($header_rev); |
351 | + |
352 | + return $header; |
353 | + } |
354 | + |
355 | + /** |
356 | + * Extracts mime type from supplied header content-type string. |
357 | + * |
358 | + * @param array $header |
359 | + * |
360 | + * @return string |
361 | + */ |
362 | + private function getMimeType(array $header) |
363 | + { |
364 | + $mime_type = null; |
365 | + |
366 | + if (isset($header['content-type'])) |
367 | + { |
368 | + $mime_type = $header['content-type']; |
369 | + |
370 | + // Extract mime type from content-type header |
371 | + // TODO : use a regular expression instead of this crappy flow |
372 | + $matches = array(); |
373 | + if (strpos($header['content-type'], 'charset') !== false) |
374 | + { |
375 | + if (preg_match('/(.+); ?charset=.+/i', $header['content-type'], $matches)) |
376 | + { |
377 | + $mime_type = $matches[1]; |
378 | + } |
379 | + } |
380 | + } |
381 | + |
382 | + return $mime_type; |
383 | + } |
384 | } |
385 | \ No newline at end of file |