Merge lp:~zorba-coders/zorba/web_crawler_tutorial into lp:zorba
- web_crawler_tutorial
- Merge into trunk
Proposed by
Sorin Marian Nasoi
Status: | Merged |
---|---|
Approved by: | Sorin Marian Nasoi |
Approved revision: | 10480 |
Merged at revision: | 10687 |
Proposed branch: | lp:~zorba-coders/zorba/web_crawler_tutorial |
Merge into: | lp:zorba |
Diff against target: |
767 lines (+379/-278) 3 files modified
doc/zorba/link_crawler2.dox (+238/-0) doc/zorba/web_crawler.dox (+141/-46) scripts/link_crawler.xq (+0/-232) |
To merge this branch: | bzr merge lp:~zorba-coders/zorba/web_crawler_tutorial |
Related bugs: |
Reviewer | Review Type | Date Requested | Status |
---|---|---|---|
Nicolae Brinza | Approve | ||
Sorin Marian Nasoi | Approve | ||
Zorba Coders | Pending | ||
Review via email: mp+95049@code.launchpad.net |
Commit message
Updated the web crawler tutorial.
Description of the change
Updated the web crawler tutorial.
This is in fact a copy of the branch
lp:~danielturcanu/zorba/web_crawler_tutorial
developed by Daniel Turcanu.
To post a comment you must log in.
Revision history for this message
Sorin Marian Nasoi (sorin.marian.nasoi) : | # |
review:
Approve
Revision history for this message
Nicolae Brinza (nbrinza) : | # |
review:
Approve
Revision history for this message
Zorba Build Bot (zorba-buildbot) wrote : | # |
Revision history for this message
Zorba Build Bot (zorba-buildbot) wrote : | # |
Validation queue job web_crawler_
All tests succeeded!
Preview Diff
[H/L] Next/Prev Comment, [J/K] Next/Prev File, [N/P] Next/Prev Hunk
1 | === added file 'doc/zorba/link_crawler2.dox' |
2 | --- doc/zorba/link_crawler2.dox 1970-01-01 00:00:00 +0000 |
3 | +++ doc/zorba/link_crawler2.dox 2012-02-28 21:09:28 +0000 |
4 | @@ -0,0 +1,238 @@ |
5 | +/** |
6 | +\page link_crawler2 Web Crawler example in XQuery |
7 | +\code |
8 | +(: |
9 | + : Copyright 2006-2011 The FLWOR Foundation. |
10 | + : |
11 | + : Licensed under the Apache License, Version 2.0 (the "License"); |
12 | + : you may not use this file except in compliance with the License. |
13 | + : You may obtain a copy of the License at |
14 | + : |
15 | + : http://www.apache.org/licenses/LICENSE-2.0 |
16 | + : |
17 | + : Unless required by applicable law or agreed to in writing, software |
18 | + : distributed under the License is distributed on an "AS IS" BASIS, |
19 | + : WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
20 | + : See the License for the specific language governing permissions and |
21 | + : limitations under the License. |
22 | +:) |
23 | + |
24 | +import module namespace http = "http://www.zorba-xquery.com/modules/http-client"; |
25 | +import module namespace map = "http://www.zorba-xquery.com/modules/store/data-structures/unordered-map"; |
26 | +import module namespace html = "http://www.zorba-xquery.com/modules/converters/html"; |
27 | +import module namespace parse-xml = "http://www.zorba-xquery.com/modules/xml"; |
28 | +import module namespace file = "http://expath.org/ns/file"; |
29 | + |
30 | +declare namespace ann = "http://www.zorba-xquery.com/annotations"; |
31 | +declare namespace xhtml="http://www.w3.org/1999/xhtml"; |
32 | +declare namespace output="http://www.w3.org/2010/xslt-xquery-serialization"; |
33 | +declare namespace err="http://www.w3.org/2005/xqt-errors"; |
34 | +declare namespace httpsch = "http://expath.org/ns/http-client"; |
35 | + |
36 | +declare variable $top-uri as xs:string := "http://www.zorba-xquery.com/html/index"; |
37 | +declare variable $uri-host as xs:string := "http://www.zorba-xquery.com"; |
38 | + |
39 | + |
40 | + |
41 | +declare variable $local:processed-internal-links := xs:QName("processed-internal-links"); |
42 | +declare variable $local:processed-external-links := xs:QName("processed-external-links"); |
43 | + |
44 | + |
45 | +declare %ann:sequential function local:create-containers() |
46 | +{ |
47 | + map:create($local:processed-internal-links, xs:QName("xs:string")); |
48 | + map:create($local:processed-external-links, xs:QName("xs:string")); |
49 | +}; |
50 | + |
51 | +declare %ann:sequential function local:delete-containers(){ |
52 | + for $x in map:available-maps() |
53 | + return map:delete($x); |
54 | +}; |
55 | + |
56 | +declare function local:is-internal($x as xs:string) as xs:boolean |
57 | +{ |
58 | + starts-with($x, $uri-host) |
59 | +}; |
60 | + |
61 | +declare function local:my-substring-before($s1 as xs:string, $s2 as xs:string) as xs:string |
62 | +{ |
63 | +let $sb := fn:substring-before($s1, $s2) |
64 | +return if($sb = "") then $s1 else $sb |
65 | +}; |
66 | + |
67 | +declare %ann:sequential function local:get-real-link($href as xs:string, $start-uri as xs:string) as xs:string? |
68 | +{ |
69 | + variable $absuri; |
70 | + try{ |
71 | + $absuri := local:my-substring-before(resolve-uri(fn:normalize-space($href), $start-uri), "#"); |
72 | + } |
73 | + catch * |
74 | + { |
75 | + map:insert($local:processed-external-links, (<FROM>{$start-uri}</FROM>, |
76 | + <MESSAGE>malformed</MESSAGE>, |
77 | + <RESULT>broken</RESULT>), $href); |
78 | + } |
79 | + $absuri |
80 | +}; |
81 | + |
82 | + |
83 | +declare function local:get-media-type ($http-call as node()) as xs:string |
84 | +{ |
85 | + local:my-substring-before($http-call/httpsch:header[@name = 'Content-Type'][1]/string(@value), ";") |
86 | +}; |
87 | + |
88 | +declare function local:alive($http-call as item()*) as xs:boolean |
89 | +{ |
90 | + if((count($http-call) ge 1) and |
91 | + ($http-call[1]/@status eq 200)) |
92 | + then true() else fn:trace(false(), "alive") |
93 | +}; |
94 | + |
95 | + |
96 | +declare %ann:sequential function local:get-out-links-parsed($content as node()*, $uri as xs:string) as xs:string* |
97 | +{ distinct-values( for $y in ($content//*:a/string(@href), |
98 | + $content//*:link/string(@href), |
99 | + $content//*:script/string(@src), |
100 | + $content//*:img/string(@src), |
101 | + $content//*:area/string(@href) |
102 | + ) |
103 | +return local:get-real-link($y, $uri)) |
104 | +}; |
105 | + |
106 | + |
107 | +declare %ann:sequential function local:get-out-links-unparsed($content as xs:string, $uri as xs:string) as xs:string*{ |
108 | + |
109 | + distinct-values( |
110 | + let $search := fn:analyze-string($content, "(<|&lt;|<)(((a|link|area).+?href)|((script|img).+?src))=([""'])(.*?)\7") |
111 | + for $other-uri2 in $search//group[@nr=8]/string() |
112 | + return local:get-real-link($other-uri2, $uri) |
113 | + ) |
114 | +}; |
115 | + |
116 | + |
117 | +declare %ann:sequential function local:map-insert-result($map-name as xs:QName, $url as xs:string, $http-result as item()*) |
118 | +{ |
119 | + if(count($http-result) ge 1) |
120 | + then |
121 | + map:insert($map-name, (<STATUS>{fn:string($http-result[1]/@status)}</STATUS>, |
122 | + <MESSAGE>{fn:string($http-result[1]/@message)}</MESSAGE>, |
123 | + <RESULT>{if(local:alive($http-result)) then "Ok" else "broken"}</RESULT>), $url) |
124 | + else map:insert($map-name, <RESULT>broken</RESULT>, $url) |
125 | +}; |
126 | + |
127 | +declare %ann:sequential function local:process-link($x as xs:string, $baseUri as xs:string, $n as xs:integer) as item()*{ |
128 | + if(local:is-internal($x)) |
129 | + then local:process-internal-link($x, $baseUri, $n); |
130 | + else local:process-external-link($x, $baseUri); |
131 | + |
132 | +}; |
133 | + |
134 | +declare %ann:sequential function local:process-external-link($x as xs:string, $baseUri as xs:string){ |
135 | + if(not(empty(map:get($local:processed-external-links, $x)))) |
136 | + then exit returning false(); |
137 | + else {} |
138 | + fn:trace($x, "HEAD external link"); |
139 | + map:insert($local:processed-external-links, <FROM>{$baseUri}</FROM>, $x); |
140 | + variable $http-call:=(); |
141 | + try{ |
142 | + $http-call:=http:send-request(<httpsch:request method="HEAD" href="{$x}"/>, (), ()); |
143 | + if((count($http-call) ge 1) and |
144 | + fn:not($http-call[1]/@status eq 200)) then |
145 | + $http-call:=http:send-request(<httpsch:request method="GET" href="{$x}"/>, (), ()); |
146 | + else |
147 | + (); |
148 | + } |
149 | + catch * { } |
150 | + local:map-insert-result($local:processed-external-links, $x, $http-call); |
151 | +}; |
152 | + |
153 | +declare function local:tidy-options() |
154 | +{<options xmlns="http://www.zorba-xquery.com/modules/converters/html-options" > |
155 | + <tidyParam name="output-xml" value="yes" /> |
156 | + <tidyParam name="doctype" value="omit" /> |
157 | + <tidyParam name="quote-nbsp" value="no" /> |
158 | + <tidyParam name="char-encoding" value="utf8" /> |
159 | + <tidyParam name="newline" value="LF" /> |
160 | + <tidyParam name="tidy-mark" value="no" /> |
161 | + <tidyParam name="new-inline-tags" value="nav header section article footer xqdoc:custom d c options json-param" /> |
162 | + </options> |
163 | +}; |
164 | + |
165 | + |
166 | +declare %ann:sequential function local:process-internal-link($x as xs:string, $baseUri as xs:string, $n as xs:integer){ |
167 | + (: if($n=3) then exit returning (); else {} :) |
168 | + if(not(empty(map:get($local:processed-internal-links, $x)))) |
169 | + then exit returning false(); |
170 | + else {} |
171 | + fn:trace($x, "GET internal link"); |
172 | + map:insert($local:processed-internal-links, <FROM>{$baseUri}</FROM>, $x); |
173 | + variable $http-call:=(); |
174 | + try{ |
175 | + $http-call:=http:send-request(<httpsch:request method="GET" href="{$x}"/>, (), ()); |
176 | + } |
177 | + catch * { } |
178 | + if( not(local:alive($http-call))) |
179 | + then { local:map-insert-result($local:processed-internal-links, $x, $http-call); exit returning ();} |
180 | + else {} |
181 | + |
182 | + if(not (local:get-media-type($http-call[1]) = "text/html")) |
183 | + then { local:map-insert-result($local:processed-internal-links, $x, $http-call); exit returning ();} |
184 | + else {} |
185 | + variable $string-content := string($http-call[2]); |
186 | + variable $content:=(); |
187 | + |
188 | + try{ |
189 | + $content:=html:parse($string-content,local:tidy-options() ); |
190 | + local:map-insert-result($local:processed-internal-links, $x, $http-call); |
191 | + } |
192 | + catch * |
193 | + { |
194 | + map:insert($local:processed-internal-links, (<MESSAGE>{concat("cannot tidy: ", $err:description)}</MESSAGE>, |
195 | + <RESULT>broken</RESULT>), $x); |
196 | + try{ |
197 | + $content:=parse-xml:parse-xml-fragment ($string-content, ""); |
198 | + } |
199 | + catch * |
200 | + { map:insert($local:processed-internal-links, <MESSAGE>{concat("cannot parse: ", $err:description)}</MESSAGE>, $x);} |
201 | + } |
202 | + variable $links :=(); |
203 | + if(empty($content)) |
204 | + then $links:=local:get-out-links-unparsed($string-content, fn:trace($x, "parse with regex, because tidy failed")); |
205 | + else $links:=local:get-out-links-parsed($content, $x); |
206 | + for $l in $links |
207 | + return local:process-link($l, $x, $n+1); |
208 | +}; |
209 | + |
210 | + |
211 | + |
212 | + |
213 | +declare function local:print-results() as element()* |
214 | +{ |
215 | + for $x in map:keys($local:processed-internal-links)/map:attribute/@value/string() |
216 | + return <INTERNAL><LINK>{$x}</LINK>{map:get($local:processed-internal-links,$x)}</INTERNAL>, |
217 | + for $x in map:keys($local:processed-external-links)/map:attribute/@value/string() |
218 | + return <EXTERNAL><LINK>{$x}</LINK>{map:get($local:processed-external-links,$x)}</EXTERNAL> |
219 | +}; |
220 | + |
221 | +(:========================================== |
222 | +===========================================:) |
223 | + |
224 | +variable $uri:= $top-uri; |
225 | + |
226 | +variable $result; |
227 | + |
228 | +local:create-containers(); |
229 | +local:process-link($uri, "", 1); |
230 | +$result:=local:print-results() ; |
231 | + |
232 | +local:delete-containers(); |
233 | + |
234 | +file:write(fn:resolve-uri("link_crawler_result.xml"), |
235 | + <result>{$result}</result>, |
236 | + <output:serialization-parameters> |
237 | + <output:indent value="yes"/> |
238 | + </output:serialization-parameters>) |
239 | + |
240 | + |
241 | +\endcode |
242 | +*/ |
243 | |
244 | === modified file 'doc/zorba/web_crawler.dox' |
245 | --- doc/zorba/web_crawler.dox 2011-10-07 08:28:43 +0000 |
246 | +++ doc/zorba/web_crawler.dox 2012-02-28 21:09:28 +0000 |
247 | @@ -1,17 +1,23 @@ |
248 | /** |
249 | \page web_crawler_tutorial Web Crawler example in XQuery |
250 | |
251 | -Description of a web crawler example in XQuery. |
252 | +Description of a web crawler example in XQuery.<br/> |
253 | +The entire script can be seen here: |
254 | +\link link_crawler2 |
255 | +web crawler script |
256 | +\endlink |
257 | |
258 | +<br/> |
259 | The idea is to crawl through the pages of a website and store a list with external pages and internal pages and check if they work or not. |
260 | This example uses Zorba's http module for accessing the webpages, and the html module for converting the html to xml. |
261 | -The complete code can be found in the test directory of the html convertor module. |
262 | +The complete code can be found in the test directory of the html convertor module (link_crawler2.xq2). |
263 | |
264 | \code |
265 | import module namespace http = "http://www.zorba-xquery.com/modules/http-client"; |
266 | import module namespace map = "http://www.zorba-xquery.com/modules/store/data-structures/unordered-map"; |
267 | import module namespace html = "http://www.zorba-xquery.com/modules/converters/html"; |
268 | import module namespace parse-xml = "http://www.zorba-xquery.com/modules/xml"; |
269 | +import module namespace file = "http://expath.org/ns/file"; |
270 | \endcode |
271 | |
272 | The internal pages are checked recursively, while the external ones are only checked for existence. |
273 | @@ -19,8 +25,8 @@ |
274 | Change this variable to point to your website, or a subdirectory on your website. |
275 | |
276 | \code |
277 | -declare variable $top-uri as xs:string := "http://www.zorba-xquery.com/site2/html/index.html"; |
278 | -declare variable $uri-host as xs:string := "http://www.zorba-xquery.com/site2/"; |
279 | +declare variable $top-uri as xs:string := "http://www.zorba-xquery.com/html/index/"; |
280 | +declare variable $uri-host as xs:string := "http://www.zorba-xquery.com"; |
281 | |
282 | declare function local:is-internal($x as xs:string) as xs:boolean |
283 | { |
284 | @@ -32,7 +38,7 @@ |
285 | The crawling starts from the URI pointed by $top-uri. |
286 | |
287 | Visited links are stored as nodes in two maps, one for internal pages and one for external pages. |
288 | -The keys are the URIs, and the values are the strings "broken" or "clean". |
289 | +The keys are the URIs, and the values are the strings "broken" or "clean", plus error codes if processing failed. |
290 | The maps are used to avoid parsing the same page twice. |
291 | |
292 | \code |
293 | @@ -55,10 +61,36 @@ |
294 | After parsing an internal page with html module, all the links are extracted and parsed recursively, if they haven't been parsed. |
295 | The html module uses tidy library, so we use tidy options to setup for converting from html to xml. |
296 | Some html tags are marked to be ignored in new-inline-tags param, this being a particular case of this website. |
297 | -You can add or remove tags to suit your website needs. |
298 | +You can add or remove tags to suit your website needs.<br/> |
299 | +The spaces in the url links are trimmed and normalized, and the fragment part is ignored. |
300 | |
301 | \code |
302 | -declare function local:get-out-links-parsed($content as node()*, $uri as xs:string) as xs:string* |
303 | +declare variable $local:tidy-options := <options xmlns="http://www.zorba-xquery.com/modules/converters/html-options" > |
304 | + <tidyParam name="output-xml" value="yes" /> |
305 | + <tidyParam name="doctype" value="omit" /> |
306 | + <tidyParam name="quote-nbsp" value="no" /> |
307 | + <tidyParam name="char-encoding" value="utf8" /> |
308 | + <tidyParam name="newline" value="LF" /> |
309 | + <tidyParam name="tidy-mark" value="no" /> |
310 | + <tidyParam name="new-inline-tags" value="nav header section article footer xqdoc:custom d c options json-param" /> |
311 | + </options>; |
312 | + |
313 | +declare %ann:sequential function local:get-real-link($href as xs:string, $start-uri as xs:string) as xs:string? |
314 | +{ |
315 | + variable $absuri; |
316 | + try{ |
317 | + $absuri := local:my-substring-before(resolve-uri(fn:normalize-space($href), $start-uri), "#"); |
318 | + } |
319 | + catch * |
320 | + { |
321 | + map:insert($local:processed-external-links, (<FROM>{$start-uri}</FROM>, |
322 | + <MESSAGE>malformed</MESSAGE>, |
323 | + <RESULT>broken</RESULT>), $href); |
324 | + } |
325 | + $absuri |
326 | +}; |
327 | + |
328 | +declare %ann:sequential function local:get-out-links-parsed($content as node()*, $uri as xs:string) as xs:string* |
329 | { distinct-values( for $y in ($content//*:a/string(@href), |
330 | $content//*:link/string(@href), |
331 | $content//*:script/string(@src), |
332 | @@ -68,90 +100,128 @@ |
333 | return local:get-real-link($y, $uri)) |
334 | }; |
335 | |
336 | -declare function local:tidy-options() |
337 | -{<options xmlns="http://www.zorba-xquery.com/modules/converters/html-options" > |
338 | - <tidyParam name="output-xml" value="yes" /> |
339 | - <tidyParam name="doctype" value="omit" /> |
340 | - <tidyParam name="quote-nbsp" value="no" /> |
341 | - <tidyParam name="char-encoding" value="utf8" /> |
342 | - <tidyParam name="newline" value="LF" /> |
343 | - <tidyParam name="tidy-mark" value="no" /> |
344 | - <tidyParam name="new-inline-tags" value="nav header section article footer xqdoc:custom d c options json-param" /> |
345 | - </options> |
346 | + |
347 | +declare %ann:sequential function local:map-insert-result($map-name as xs:QName, $url as xs:string, $http-result as item()*) |
348 | +{ |
349 | + if(count($http-result) ge 1) |
350 | + then |
351 | + map:insert($map-name, (<STATUS>{fn:string($http-result[1]/@status)}</STATUS>, |
352 | + <MESSAGE>{fn:string($http-result[1]/@message)}</MESSAGE>, |
353 | + <RESULT>{if(local:alive($http-result)) |
354 | + then "Ok" |
355 | + else if(local:is-redirect($http-result)) |
356 | + then "redirect" |
357 | + else "broken" |
358 | + }</RESULT>), $url); |
359 | + else map:insert($map-name, <RESULT>broken</RESULT>, $url); |
360 | + if(local:is-redirect($http-result)) then |
361 | + map:insert($map-name, <REDIRECT>{fn:string($http-result[1]/httpsch:header[@name = "Location"]/@value)}</REDIRECT>, $url); |
362 | + else {} |
363 | }; |
364 | |
365 | -declare %ann:sequential function local:process-internal-link($x as xs:string, $n as xs:integer){ |
366 | - if($n=3) then exit returning (); else {} |
367 | +declare %ann:sequential function local:process-internal-link($x as xs:string, $baseUri as xs:string, $n as xs:integer){ |
368 | if(not(empty(map:get($local:processed-internal-links, $x)))) |
369 | then exit returning false(); |
370 | else {} |
371 | + fn:trace($x, "GET internal link"); |
372 | + map:insert($local:processed-internal-links, <FROM>{$baseUri}</FROM>, $x); |
373 | variable $http-call:=(); |
374 | try{ |
375 | - $http-call:=http:send-request(<httpsch:request method="GET" href="{$x}"/>, (), ()); |
376 | - } |
377 | - catch * {} |
378 | - if( not(local:alive($http-call))) |
379 | - then { map:insert($local:processed-internal-links, "broken", $x); exit returning ();} |
380 | - else {} |
381 | - if(not (local:get-media-type($http-call[1]) = $supported-media-types)) |
382 | - then {map:insert($local:processed-internal-links, "clean", $x); exit returning ();} |
383 | - else {} |
384 | - variable $string-content := xs:string($http-call[2]); |
385 | + $http-call:=http:send-request(<httpsch:request method="GET" href="{$x}" follow-redirect="false"/>, (), ()); |
386 | + } |
387 | + catch * { } |
388 | + if(local:is-redirect($http-call)) then |
389 | + { |
390 | + local:map-insert-result($local:processed-internal-links, $x, $http-call); |
391 | + try{ |
392 | + $http-call:=http:send-request(<httpsch:request method="GET" href="{$x}"/>, (), ()); |
393 | + } |
394 | + catch * { } |
395 | + } |
396 | + else {} |
397 | + if( not(local:alive($http-call))) |
398 | + then { local:map-insert-result($local:processed-internal-links, $x, $http-call); exit returning ();} |
399 | + else {} |
400 | + |
401 | + if(not (local:get-media-type($http-call[1]) = "text/html")) |
402 | + then { local:map-insert-result($local:processed-internal-links, $x, $http-call); exit returning ();} |
403 | + else {} |
404 | + variable $string-content := string($http-call[2]); |
405 | variable $content:=(); |
406 | |
407 | try{ |
408 | - $content:=html:parse($string-content,local:tidy-options() ); |
409 | + $content:=html:parse($string-content,$local:tidy-options ); |
410 | + local:map-insert-result($local:processed-internal-links, $x, $http-call); |
411 | } |
412 | catch * |
413 | - { |
414 | - map:insert($local:processed-internal-links, concat("cannot tidy", $err:description), $x); |
415 | + { |
416 | + map:insert($local:processed-internal-links, (<MESSAGE>{concat("cannot tidy: ", $err:description)}</MESSAGE>, |
417 | + <RESULT>broken</RESULT>), $x); |
418 | try{ |
419 | $content:=parse-xml:parse-xml-fragment ($string-content, ""); |
420 | } |
421 | catch * |
422 | - { map:insert($local:processed-internal-links, concat("cannot parse", $err:description), $x);} |
423 | + { map:insert($local:processed-internal-links, <MESSAGE>{concat("cannot parse: ", $err:description)}</MESSAGE>, $x);} |
424 | } |
425 | variable $links :=(); |
426 | if(empty($content)) |
427 | - then $links:=local:get-out-links-unparsed($string-content, $x); |
428 | + then $links:=local:get-out-links-unparsed($string-content, fn:trace($x, "parse with regex, because tidy failed")); |
429 | else $links:=local:get-out-links-parsed($content, $x); |
430 | for $l in $links |
431 | - return local:process-link($l, $n+1); |
432 | + return local:process-link($l, $x, $n+1); |
433 | }; |
434 | |
435 | \endcode |
436 | |
437 | +For each parsed link, we store the FROM, STATUS, MESSAGE and RESULT. The RESULT is "Ok" if everything went fine, |
438 | +or "broken" if the page couldn't be retrieved or parsed, and in this case MESSAGE contains the error message. |
439 | +The FROM element contains the parent url for that link.<br/> |
440 | +<br/> |
441 | Some html pages have errors, and tidy library is very strict with checking errors. |
442 | When the parsing fails, we fallback to using regex for extracting the links. |
443 | |
444 | \code |
445 | -declare function local:get-out-links-unparsed($content as xs:string, $uri as xs:string) as xs:string*{ |
446 | +declare %ann:sequential function local:get-out-links-unparsed($content as xs:string, $uri as xs:string) as xs:string*{ |
447 | |
448 | distinct-values( |
449 | let $search := fn:analyze-string($content, "(<|&lt;|<)(((a|link|area).+?href)|((script|img).+?src))=([""'])(.*?)\7") |
450 | for $other-uri2 in $search//group[@nr=8]/string() |
451 | - let $y:= fn:normalize-space($other-uri2) |
452 | - return local:get-real-link($y, $uri) |
453 | + return local:get-real-link($other-uri2, $uri) |
454 | ) |
455 | }; |
456 | |
457 | \endcode |
458 | |
459 | For external links, we just check if they exist, so the http command requests only for HEAD. |
460 | +Some websites return an error for HEAD; in this case we fall back to using GET. |
461 | |
462 | \code |
463 | -declare %ann:sequential function local:process-external-link($x as xs:string){ |
464 | +declare %ann:sequential function local:process-external-link($x as xs:string, $baseUri as xs:string){ |
465 | if(not(empty(map:get($local:processed-external-links, $x)))) |
466 | then exit returning false(); |
467 | else {} |
468 | - variable $http-call:=(); |
469 | + fn:trace($x, "HEAD external link"); |
470 | + map:insert($local:processed-external-links, <FROM>{$baseUri}</FROM>, $x); |
471 | + variable $http-call:=(); |
472 | try{ |
473 | - $http-call:=http:send-request(<httpsch:request method="HEAD" href="{$x}"/>, (), ()); |
474 | + $http-call:=http:send-request(<httpsch:request method="GET" href="{$x}"/>, (), ()); |
475 | + if((count($http-call) ge 1) and |
476 | + fn:not($http-call[1]/@status eq 200)) then |
477 | + { |
478 | + if(local:is-redirect($http-call)) then |
479 | + { |
480 | + local:map-insert-result($local:processed-external-links, $x, $http-call); |
481 | + } |
482 | + else {} |
483 | + $http-call:=http:send-request(<httpsch:request method="GET" href="{$x}"/>, (), ()); |
484 | + local:map-insert-result($local:processed-external-links, $x, $http-call); |
485 | + } |
486 | + else |
487 | + {} |
488 | } |
489 | - catch * {} |
490 | - if( local:alive($http-call)) |
491 | - then map:insert($local:processed-external-links, "clean", $x); |
492 | - else map:insert($local:processed-external-links, "broken", $x); |
493 | + catch * |
494 | + { $http-call:=();} |
495 | + local:map-insert-result($local:processed-external-links, $x, $http-call); |
496 | }; |
497 | |
498 | \endcode |
499 | @@ -170,4 +240,29 @@ |
500 | |
501 | \endcode |
502 | |
503 | -*/ |
504 | \ No newline at end of file |
505 | +The main program calls the recursive function local:process-link for the $top-uri. |
506 | + |
507 | +\code |
508 | +(:========================================== |
509 | +===========================================:) |
510 | + |
511 | +variable $uri:= $top-uri; |
512 | + |
513 | +variable $result; |
514 | + |
515 | +local:create-containers(); |
516 | +local:process-link($uri, "", 1); |
517 | +$result:=local:print-results() ; |
518 | + |
519 | +local:delete-containers(); |
520 | + |
521 | +file:write(fn:resolve-uri("link_crawler_result.xml"), |
522 | + <result>{$result}</result>, |
523 | + <output:serialization-parameters> |
524 | + <output:indent value="yes"/> |
525 | + </output:serialization-parameters>) |
526 | + |
527 | +\endcode |
528 | + |
529 | + |
530 | +*/ |
531 | |
532 | === removed file 'scripts/link_crawler.xq' |
533 | --- scripts/link_crawler.xq 2011-08-18 20:07:20 +0000 |
534 | +++ scripts/link_crawler.xq 1970-01-01 00:00:00 +0000 |
535 | @@ -1,232 +0,0 @@ |
536 | -(: |
537 | - : Copyright 2006-2011 The FLWOR Foundation. |
538 | - : |
539 | - : Licensed under the Apache License, Version 2.0 (the "License"); |
540 | - : you may not use this file except in compliance with the License. |
541 | - : You may obtain a copy of the License at |
542 | - : |
543 | - : http://www.apache.org/licenses/LICENSE-2.0 |
544 | - : |
545 | - : Unless required by applicable law or agreed to in writing, software |
546 | - : distributed under the License is distributed on an "AS IS" BASIS, |
547 | - : WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
548 | - : See the License for the specific language governing permissions and |
549 | - : limitations under the License. |
550 | -:) |
551 | - |
552 | -import module namespace http = "http://www.zorba-xquery.com/modules/http-client"; |
553 | -import module namespace file = "http://expath.org/ns/file"; |
554 | -import module namespace ddl = "http://www.zorba-xquery.com/modules/store/dynamic/collections/ddl"; |
555 | -import module namespace dml = "http://www.zorba-xquery.com/modules/store/dynamic/collections/dml"; |
556 | -import module namespace map = "http://www.zorba-xquery.com/modules/store/data-structures/unordered-map"; |
557 | -(:import module namespace lcc = "http://www.zorba-xquery.com/scripts/link-crawler-collections" at "link_crawler_collections.xq";:) |
558 | -import module namespace tidy="http://www.zorba-xquery.com/modules/converters/html"; |
559 | -import schema namespace tidy-options="http://www.zorba-xquery.com/modules/converters/html-options"; |
560 | -import schema namespace httpsch = "http://expath.org/ns/http-client"; |
561 | -declare namespace ann = "http://www.zorba-xquery.com/annotations"; |
562 | -declare namespace xhtml="http://www.w3.org/1999/xhtml"; |
563 | -declare namespace output="http://www.w3.org/2010/xslt-xquery-serialization"; |
564 | -declare namespace err="http://www.w3.org/2005/xqt-errors"; |
565 | - |
566 | -declare variable $result-file as xs:string external; (:PROJECT_SOURCE_DIR:) |
567 | - |
568 | -declare variable $top-uri as xs:string := "http://www.zorba-xquery.com/site2/html/index.html"; (: the start page :) |
569 | -declare variable $uri-host as xs:string := "http://www.zorba-xquery.com/site2/"; (: what differentiates an internal uri :) |
570 | - |
571 | -declare variable $supported-media-types as xs:string+ := ("text/xml", "application/xml", "text/xml-external-parsed-entity", "application/xml-external-parsed-entity", |
572 | - "application/atom+xml", "text/html"); |
573 | - |
574 | -declare variable $internal-uris as xs:QName := xs:QName("internal-uris"); |
575 | -declare variable $external-uris as xs:QName := xs:QName("external-uris"); |
576 | - |
577 | -declare variable $broken-internal as xs:QName := xs:QName("local:broken-internal"); |
578 | -declare variable $broken-external as xs:QName := xs:QName("local:broken-external"); |
579 | -declare variable $pages-cannot-parse as xs:QName := xs:QName("local:pages-cannot-parse"); |
580 | - |
581 | - |
582 | -declare function local:my-substring-before($s1 as xs:string, $s2 as xs:string) as xs:string |
583 | -{ |
584 | - let $sb := fn:substring-before($s1, $s2) |
585 | - return |
586 | - if($sb = "") then |
587 | - $s1 |
588 | - else |
589 | - $sb |
590 | -}; |
591 | - |
592 | - |
593 | -declare %ann:sequential function local:get-uris-from-page($uri as xs:string, |
594 | - $reluri as xs:string, |
595 | - $call-from as xs:string) |
596 | -{ |
597 | - variable $method; |
598 | - if(fn:starts-with($uri, $uri-host)) then |
599 | - { |
600 | - map:insert($internal-uris, $uri, $uri); |
601 | - $method := "GET"; |
602 | - } |
603 | - else |
604 | - { |
605 | - map:insert($external-uris, $uri, $uri); |
606 | - $method := "HEAD"; |
607 | - } |
608 | - fn:trace($uri, ""); |
609 | - |
610 | - variable $load-result; |
611 | - variable $content-string; |
612 | - try{ |
613 | - $load-result := http:send-request(<httpsch:request method="{$method}" href="{$uri}"/>, (), ()); |
614 | - if($load-result[1]/@status eq 200) then |
615 | - if(fn:starts-with($uri, $uri-host)) then |
616 | - { |
617 | - $content-string := string($load-result[2]); |
618 | - let $media-type := local:my-substring-before($load-result[1]/httpsch:header[@name = 'Content-Type'][1]/fn:string(@value), ";") |
619 | - return |
620 | - if($media-type = "text/html") then |
621 | - let $content := tidy:parse($content-string, |
622 | - <options xmlns="http://www.zorba-xquery.com/modules/converters/html-options" > |
623 | - <tidyParam name="output-xml" value="yes" /> |
624 | - <tidyParam name="doctype" value="omit" /> |
625 | - <tidyParam name="quote-nbsp" value="no" /> |
626 | - <tidyParam name="char-encoding" value="utf8" /> |
627 | - <tidyParam name="newline" value="LF" /> |
628 | - <tidyParam name="tidy-mark" value="no" /> |
629 | - <tidyParam name="new-inline-tags" value="nav header section article footer xqdoc:custom d c options json-param" /> |
630 | - </options>) |
631 | - for $other-uri2 in ($content//*:a/string(@href), |
632 | - $content//*:link/string(@href), |
633 | - $content//*:script/string(@src), |
634 | - $content//*:img/string(@src), |
635 | - $content//*:area/string(@href) |
636 | - ) |
637 | - let $other-uri := fn:normalize-space($other-uri2) |
638 | - let $absuri := local:my-substring-before(fn:resolve-uri($other-uri, $uri), "#") |
639 | - return |
640 | - if(fn:not(fn:starts-with($other-uri, "#")) and |
641 | - fn:empty(map:get($internal-uris, $absuri)) and |
642 | - fn:empty(map:get($external-uris, $absuri))) then |
643 | - local:get-uris-from-page($absuri, $other-uri, $uri); |
644 | - else (:already followed this link:) |
645 | - (); |
646 | - else(: it's binary :) |
647 | - fn:trace((" has binary content ", $media-type), ""); |
648 | - } |
649 | - else(:success loading external link:) |
650 | - (); |
651 | - else (: broken link :) |
652 | - if(fn:starts-with($uri, $uri-host)) then |
653 | - { |
654 | - dml:insert-nodes-last($broken-internal, <internal-broken-uri> |
655 | - <uri>{$reluri}</uri> |
656 | - <call-from>{$call-from}</call-from> |
657 | - <media-type>{local:my-substring-before($load-result[1]/httpsch:header[@name = 'Content-Type'][1]/fn:string(@value), ";")}</media-type> |
658 | - </internal-broken-uri>); |
659 | - } |
660 | - else |
661 | - { |
662 | - dml:insert-nodes-last($broken-external, <external-broken-uri> |
663 | - <uri>{$uri}</uri> |
664 | - <call-from>{$call-from}</call-from> |
665 | - <media-type>{local:my-substring-before($load-result[1]/httpsch:header[@name = 'Content-Type'][1]/fn:string(@value), ";")}</media-type> |
666 | - </external-broken-uri>); |
667 | - } |
668 | - }catch ZXQP0003 |
669 | - { |
670 | - dml:insert-nodes-last($pages-cannot-parse, <page-cannot-parse> |
671 | - <uri>{$uri}</uri> |
672 | - <reluri>{$reluri}</reluri> |
673 | - <call-from>{$call-from}</call-from> |
674 | - <err-code>{$err:code}</err-code> |
675 | - <err-description>{$err:description}</err-description> |
676 | - <err-value>{$err:value}</err-value> |
677 | - <err-module>{$err:module}</err-module> |
678 | - <err-line>{$err:line-number}</err-line> |
679 | - </page-cannot-parse>); |
680 | - try{ (: tidy failed to parse the html, use regex:) |
681 | - let $content := $content-string |
682 | - let $search := fn:analyze-string($content, "(<|&lt;|<)(((a|link|area).+?href)|((script|img).+?src))=([""'])(.*?)\7") |
683 | - for $other-uri2 in $search//fn:group[@nr=8]/fn:string() |
684 | - let $other-uri := fn:normalize-space($other-uri2) |
685 | - let $absuri := local:my-substring-before(fn:resolve-uri($other-uri, $uri), "#") |
686 | - return |
687 | - if(fn:not(fn:starts-with($other-uri, "#")) and |
688 | - fn:empty(map:get($internal-uris, $absuri)) and |
689 | - fn:empty(map:get($external-uris, $absuri))) then |
690 | - local:get-uris-from-page($absuri, $other-uri, $uri); |
691 | - else (:already followed this link:) |
692 | - (); |
693 | - }catch * |
694 | - { |
695 | - dml:insert-nodes-last($pages-cannot-parse, <page-cannot-parse> |
696 | - <uri>{$uri}</uri> |
697 | - <reluri>{$reluri}</reluri> |
698 | - <call-from>{$call-from}</call-from> |
699 | - <err-code>{$err:code}</err-code> |
700 | - <err-description>{$err:description}</err-description> |
701 | - <err-value>{$err:value}</err-value> |
702 | - <err-module>{$err:module}</err-module> |
703 | - <err-line>{$err:line-number}</err-line> |
704 | - </page-cannot-parse>); |
705 | - } |
706 | - }catch * |
707 | - { |
708 | - dml:insert-nodes-last($pages-cannot-parse, <page-cannot-parse> |
709 | - <uri>{$uri}</uri> |
710 | - <reluri>{$reluri}</reluri> |
711 | - <call-from>{$call-from}</call-from> |
712 | - <err-code>{$err:code}</err-code> |
713 | - <err-description>{$err:description}</err-description> |
714 | - <err-value>{$err:value}</err-value> |
715 | - <err-module>{$err:module}</err-module> |
716 | - <err-line>{$err:line-number}</err-line> |
717 | - </page-cannot-parse>); |
718 | - } |
719 | -}; |
720 | - |
721 | - |
722 | - |
723 | -map:create($internal-uris, xs:QName("xs:string")); |
724 | -ddl:create($broken-internal); |
725 | -map:create($external-uris, xs:QName("xs:string")); |
726 | -ddl:create($broken-external); |
727 | -ddl:create($pages-cannot-parse); |
728 | - |
729 | -local:get-uris-from-page($top-uri, $top-uri, ""); |
730 | - |
731 | -(:display results:) |
732 | -let $full-report := |
733 | -<link-crawler website="{$top-uri}"> |
734 | -<internal-broken-uris> |
735 | -{for $i in dml:collection($broken-internal) |
736 | -let $u := $i/uri |
737 | -group by $u |
738 | -return $i} |
739 | -</internal-broken-uris> |
740 | -<pages-cannot-parse> |
741 | -{for $e in dml:collection($pages-cannot-parse) |
742 | -let $u := $e/uri |
743 | -group by $u |
744 | -return $e} |
745 | -</pages-cannot-parse> |
746 | -<external-uris> |
747 | -{for $e in map:keys($external-uris) |
748 | -let $v := $e/attribute/@value |
749 | -group by $v |
750 | -return <external-uri>{$e}</external-uri>} |
751 | -</external-uris> |
752 | -<external-broken-uris> |
753 | -{for $e in dml:collection($broken-external) |
754 | -let $u := $e/uri |
755 | -group by $u |
756 | -return $e} |
757 | -</external-broken-uris> |
758 | - |
759 | -</link-crawler> |
760 | - |
761 | -return |
762 | -file:write($result-file, |
763 | - $full-report, |
764 | - <output:serialization-parameters> |
765 | - <output:indent value="yes"/> |
766 | - </output:serialization-parameters>) |
767 | - |
Validation queue starting for merge proposal.
Log at: http://zorbatest.lambda.nu:8080/remotequeue/web_crawler_tutorial-2012-02-29T15-12-04.445Z/log.html