Merge lp:~tristan-rivoallan/vanilla-miner/vm-599285 into lp:vanilla-miner

Proposed by Tristan Rivoallan
Status: Merged
Merged at revision: 57
Proposed branch: lp:~tristan-rivoallan/vanilla-miner/vm-599285
Merge into: lp:vanilla-miner
Diff against target: 385 lines (+202/-167)
1 file modified
lib/task/minerExpandLinksTask.class.php (+202/-167)
To merge this branch: bzr merge lp:~tristan-rivoallan/vanilla-miner/vm-599285
Reviewer | Review Type | Date Requested | Status
Tristan Rivoallan | | | Approve
Review via email: mp+28771@code.launchpad.net
To post a comment you must log in.
Revision history for this message
Tristan Rivoallan (tristan-rivoallan) :
review: Approve

Preview Diff

[H/L] Next/Prev Comment, [J/K] Next/Prev File, [N/P] Next/Prev Hunk
1=== modified file 'lib/task/minerExpandLinksTask.class.php'
2--- lib/task/minerExpandLinksTask.class.php 2010-06-23 20:45:16 +0000
3+++ lib/task/minerExpandLinksTask.class.php 2010-06-29 16:31:33 +0000
4@@ -1,178 +1,213 @@
5 <?php
6-
7+/**
8+ * Crawls links URLs in order to gather meaningful informations (availability, etc).
9+ */
10 class minerExpandLinksTask extends sfBaseTask
11 {
12- protected function configure()
13- {
14- $this->addOptions(array(
15- new sfCommandOption('env', null, sfCommandOption::PARAMETER_REQUIRED, 'The environment', 'dev'),
16- new sfCommandOption('connection', null, sfCommandOption::PARAMETER_REQUIRED, 'The connection name', 'doctrine'),
17- new sfCommandOption('progress', null, sfCommandOption::PARAMETER_NONE, 'Display a progress bar'),
18- new sfCommandOption('verbose', null, sfCommandOption::PARAMETER_NONE, 'Display more informations about extraction process'),
19- new sfCommandOption('all', null, sfCommandOption::PARAMETER_NONE, 'Expand all links in database. By default, only new links are expanded'),
20- new sfCommandOption('with-unavailable', null, sfCommandOption::PARAMETER_NONE, 'When expanding all links (--all), also include links previously marked as unavailable'),
21- // TODO : add --older-than option
22- ));
23+ /**
24+ * Configures task.
25+ *
26+ * (non-PHPdoc)
27+ * @see vendor/symfony/lib/task/sfTask::configure()
28+ */
29+ protected function configure()
30+ {
31+ $this->addOptions(array(
32+ new sfCommandOption('env', null, sfCommandOption::PARAMETER_REQUIRED, 'The environment', 'dev'),
33+ new sfCommandOption('connection', null, sfCommandOption::PARAMETER_REQUIRED, 'The connection name', 'doctrine'),
34+ new sfCommandOption('progress', null, sfCommandOption::PARAMETER_NONE, 'Display a progress bar'),
35+ new sfCommandOption('verbose', null, sfCommandOption::PARAMETER_NONE, 'Display more informations about extraction process'),
36+ new sfCommandOption('all', null, sfCommandOption::PARAMETER_NONE, 'Expand all links in database. By default, only new links are expanded'),
37+ new sfCommandOption('with-unavailable', null, sfCommandOption::PARAMETER_NONE, 'When expanding all links (--all), also include links previously marked as unavailable'),
38+ // TODO : add --older-than option
39+ ));
40
41- $this->namespace = 'miner';
42- $this->name = 'expand-links';
43- // TODO : write descriptions
44- $this->briefDescription = 'Expands informations about links by crawling their URLs';
45- $this->detailedDescription = <<<EOF
46+ $this->namespace = 'miner';
47+ $this->name = 'expand-links';
48+ $this->briefDescription = 'Expands informations about links by crawling their URLs';
49+ $this->detailedDescription = <<<EOF
50
51 Use cases :
52 * Expand new urls : [php symfony miner:expand-links|INFO]
53 * Expand all urls (a word about --with-unavailable) : [php symfony miner:expand-links --all|INFO]
54 * Expand all urls, including those previously marked as unavailable : [php symfony miner:expand-links --all --with-unavailable|INFO]
55 EOF;
56- }
57-
58- protected function execute($arguments = array(), $options = array())
59- {
60- // Open database connection
61- $databaseManager = new sfDatabaseManager($this->configuration);
62- $connection = $databaseManager->getDatabase($options['connection'])->getConnection();
63-
64- // Build query for fetching links from database
65- $q = Doctrine_Query::create()
66- ->select('l.url')
67- ->from('Link l');
68- if (!$options['all'])
69- {
70- $q->where('l.expanded_at is null');
71- }
72- if (!$options['with-unavailable'])
73- {
74- $q->andWhere('l.availability != "unavailable"');
75- }
76-
77- // Fetch links from database
78- $links_count = $q->count();
79- $links = $q->execute(null, Doctrine_Core::HYDRATE_ON_DEMAND);
80- $q->free();
81- $this->logSection('info', sprintf('Expanding %s links', $links_count));
82-
83- // Instanciate progress bar, if user requested so
84- $links_expanded = 0;
85- if ($options['progress'])
86- {
87- include 'Console/ProgressBar.php';
88- $progress_bar = new Console_ProgressBar(
89- '** Links %fraction% comments [%bar%] %percent% | ',
90- '=>', '-', 80, $links_count, array('ansi_terminal' => true)
91- );
92- $progress_bar->update($links_expanded);
93- }
94-
95- // Launch a HEAD request on each link, and use data in response headers to update informations about link in database
96- // TODO : move crawling code to dedicated class. and then create miner:crawl-url task
97- require 'HTTP/Request2.php';
98- $request = new HTTP_Request2(null, HTTP_Request2::METHOD_HEAD, array('follow_redirects' => true));
99- $request->setHeader('user-agent', 'vanilla-miner/1.1 (https://launchpad.net/vanilla-miner)');
100-
101- foreach ($links as $link)
102- {
103- $link->expanded_at = time();
104- try
105- {
106- $request->setUrl($link->url);
107- $response = $request->send();
108- if (200 == $response->getStatus())
109- {
110- if ($options['progress'])
111- {
112- $this->log(sprintf('[%d] %s', $response->getStatus(), $link->url));
113- }
114- else
115- {
116- $this->logSection('info', sprintf('[%d] %s - Updating metadata, marking as available', $response->getStatus(), $link->url));
117- }
118-
119- // Extract meaningful informations from server response
120- $header = $response->getHeader();
121- $header = $this->normalizeHeader($header);
122- $link->mime_type = $this->getMimeType($header);
123-
124- // Mark link as available
125- $link->availability = 'available';
126-
127- // Save link to database
128- $link->replace();
129- }
130- else
131- {
132- if ($options['progress'])
133- {
134- $this->log(sprintf('[%d] %s', $response->getStatus(), $link->url));
135- }
136- else
137- {
138- $this->logSection('notice', sprintf(
139- '[%d] %s (%d %s) - Marking as unavailable',
140- $response->getStatus(),
141- $link->url,
142- $response->getStatus(),
143- $response->getReasonPhrase()
144- )
145- );
146- }
147- $link->availability = 'unavailable';
148- $link->replace();
149- }
150- }
151- catch (HTTP_Request2_Exception $e)
152- {
153- if ($options['progress'])
154- {
155- $this->log(sprintf('[ERR] %s', $link->url));
156- }
157- else
158- {
159- $this->logSection('error', sprintf('[ERR] Received exception with message "%s" for link "%s" - Marking as unavailable.', $e->getMessage(), $link->url));
160- }
161- $link->availability = 'unavailable';
162- $link->replace();
163- }
164-
165- // Update progress bar
166- if ($options['progress'])
167- {
168- $progress_bar->update(++$links_expanded);
169- }
170-
171- }
172- }
173-
174- private function normalizeHeader(array $header)
175- {
176- // Make all header names lower case
177- $header_rev = array_flip($header);
178- array_walk($header_rev, create_function('&$item, $key', 'strtolower($item);'));
179- $header = array_flip($header_rev);
180-
181- return $header;
182- }
183-
184- private function getMimeType(array $header)
185- {
186- $mime_type = null;
187-
188- if (isset($header['content-type']))
189- {
190- $mime_type = $header['content-type'];
191-
192- // Extract mime type from content-type header
193- // TODO : use a regular expression instead of this crappy flow
194- $matches = array();
195- if (strpos($header['content-type'], 'charset') !== false)
196- {
197- if (preg_match('/(.+); ?charset=.+/i', $header['content-type'], $matches))
198- {
199- $mime_type = $matches[1];
200- }
201- }
202- }
203-
204- return $mime_type;
205- }
206+ }
207+
208+ /**
209+ * Executes task.
210+ *
211+ * (non-PHPdoc)
212+ * @see vendor/symfony/lib/task/sfTask::execute()
213+ */
214+ protected function execute($arguments = array(), $options = array())
215+ {
216+ // Open database connection
217+ $databaseManager = new sfDatabaseManager($this->configuration);
218+ $connection = $databaseManager->getDatabase($options['connection'])->getConnection();
219+
220+ // Build query for fetching links from database
221+ $q = Doctrine_Query::create()
222+ ->select('l.url')
223+ ->from('Link l');
224+ if (!$options['all'])
225+ {
226+ $q->where('l.expanded_at is null');
227+ }
228+ if (!$options['with-unavailable'])
229+ {
230+ $q->andWhere('l.availability != "unavailable"');
231+ }
232+
233+ // Fetch links from database
234+ $links_count = $q->count();
235+ if ($links_count > 0)
236+ {
237+ $links = $q->execute(null, Doctrine_Core::HYDRATE_ON_DEMAND);
238+ $q->free();
239+ $this->logSection('expand', sprintf('Expanding %s links', $links_count));
240+
241+ // Instanciate progress bar, if user requested so
242+ $links_expanded = 0;
243+ if ($options['progress'])
244+ {
245+ include 'Console/ProgressBar.php';
246+ $progress_bar = new Console_ProgressBar(
247+ '** Links %fraction% comments [%bar%] %percent% | ',
248+ '=>', '-', 80, $links_count, array('ansi_terminal' => true)
249+ );
250+ $progress_bar->update($links_expanded);
251+ }
252+
253+ // Launch a HEAD request on each link, and use data in response headers to update informations about link in database
254+ // TODO : move crawling code to dedicated class. and then create miner:crawl-url task
255+ require 'HTTP/Request2.php';
256+ $request = new HTTP_Request2(null, HTTP_Request2::METHOD_HEAD, array('follow_redirects' => true));
257+ $request->setHeader('user-agent', 'vanilla-miner/1.1 (https://launchpad.net/vanilla-miner)');
258+
259+ foreach ($links as $link)
260+ {
261+ $link->expanded_at = time();
262+ try
263+ {
264+ $request->setUrl($link->url);
265+ $response = $request->send();
266+ if (200 == $response->getStatus())
267+ {
268+ if ($options['progress'])
269+ {
270+ $this->log(sprintf('[%d] %s', $response->getStatus(), $link->url));
271+ }
272+ else
273+ {
274+ $this->logSection('expand', sprintf('[%d] %s - Updating metadata, marking as available', $response->getStatus(), $link->url));
275+ }
276+
277+ // Extract meaningful informations from server response
278+ $header = $response->getHeader();
279+ $header = $this->normalizeHeader($header);
280+ $link->mime_type = $this->getMimeType($header);
281+
282+ // Mark link as available
283+ $link->availability = 'available';
284+
285+ // Save link to database
286+ $link->replace();
287+ }
288+ else
289+ {
290+ if ($options['progress'])
291+ {
292+ $this->log(sprintf('[%d] %s', $response->getStatus(), $link->url));
293+ }
294+ else
295+ {
296+ $this->logSection('expand', sprintf(
297+ '[%d] %s (%d %s) - Marking as unavailable',
298+ $response->getStatus(),
299+ $link->url,
300+ $response->getStatus(),
301+ $response->getReasonPhrase()
302+ ),
303+ null,
304+ 'ERROR'
305+ );
306+ }
307+ $link->availability = 'unavailable';
308+ $link->replace();
309+ }
310+ }
311+ catch (HTTP_Request2_Exception $e)
312+ {
313+ if ($options['progress'])
314+ {
315+ $this->log(sprintf('[ERR] %s', $link->url));
316+ }
317+ else
318+ {
319+ $this->logSection('expand', sprintf('[ERR] Received exception with message "%s" for link "%s" - Marking as unavailable.', $e->getMessage(), $link->url), null, 'ERROR');
320+ }
321+ $link->availability = 'unavailable';
322+ $link->replace();
323+ }
324+
325+ // Update progress bar
326+ if ($options['progress'])
327+ {
328+ $progress_bar->update(++$links_expanded);
329+ }
330+ }
331+ }
332+ else
333+ {
334+ $this->logSection('expand', 'No links to expand. Exiting.');
335+ }
336+ }
337+
338+ /**
339+ * Lowercases all header names.
340+ *
341+ * @param array $header
342+ *
343+ * @return array
344+ */
345+ private function normalizeHeader(array $header)
346+ {
347+ // Make all header names lower case
348+ $header_rev = array_flip($header);
349+ array_walk($header_rev, create_function('&$item, $key', 'strtolower($item);'));
350+ $header = array_flip($header_rev);
351+
352+ return $header;
353+ }
354+
355+ /**
356+ * Extracts mime type from supplied header content-type string.
357+ *
358+ * @param array $header
359+ *
360+ * @return string
361+ */
362+ private function getMimeType(array $header)
363+ {
364+ $mime_type = null;
365+
366+ if (isset($header['content-type']))
367+ {
368+ $mime_type = $header['content-type'];
369+
370+ // Extract mime type from content-type header
371+ // TODO : use a regular expression instead of this crappy flow
372+ $matches = array();
373+ if (strpos($header['content-type'], 'charset') !== false)
374+ {
375+ if (preg_match('/(.+); ?charset=.+/i', $header['content-type'], $matches))
376+ {
377+ $mime_type = $matches[1];
378+ }
379+ }
380+ }
381+
382+ return $mime_type;
383+ }
384 }
385\ No newline at end of file

Subscribers

People subscribed via source and target branches

to all changes: