Merge lp:~zorba-coders/zorba/web_crawler_tutorial into lp:zorba

Proposed by Sorin Marian Nasoi
Status: Superseded
Proposed branch: lp:~zorba-coders/zorba/web_crawler_tutorial
Merge into: lp:zorba
Diff against target: 767 lines (+379/-278)
3 files modified
doc/zorba/link_crawler2.dox (+238/-0)
doc/zorba/web_crawler.dox (+141/-46)
scripts/link_crawler.xq (+0/-232)
To merge this branch: bzr merge lp:~zorba-coders/zorba/web_crawler_tutorial
Reviewer Review Type Date Requested Status
Nicolae Brinza Approve
Sorin Marian Nasoi Approve
Zorba Coders Pending
Review via email: mp+95038@code.launchpad.net

Commit message

Updated the web crawler tutorial.

Description of the change

Updated the web crawler tutorial.

This is in fact a copy of the branch
lp:~danielturcanu/zorba/web_crawler_tutorial
developed by Daniel Turcanu.

To post a comment you must log in.
Revision history for this message
Sorin Marian Nasoi (sorin.marian.nasoi) :
review: Approve
10480. By Sorin Marian Nasoi <email address hidden>

Corrected the link to the Zorba site.

Revision history for this message
Sorin Marian Nasoi (sorin.marian.nasoi) :
review: Approve
Revision history for this message
Nicolae Brinza (nbrinza) :
review: Approve

Unmerged revisions

Preview Diff

[H/L] Next/Prev Comment, [J/K] Next/Prev File, [N/P] Next/Prev Hunk
1=== added file 'doc/zorba/link_crawler2.dox'
2--- doc/zorba/link_crawler2.dox 1970-01-01 00:00:00 +0000
3+++ doc/zorba/link_crawler2.dox 2012-02-28 21:06:18 +0000
4@@ -0,0 +1,238 @@
5+/**
6+\page link_crawler2 Web Crawler example in XQuery
7+\code
8+(:
9+ : Copyright 2006-2011 The FLWOR Foundation.
10+ :
11+ : Licensed under the Apache License, Version 2.0 (the "License");
12+ : you may not use this file except in compliance with the License.
13+ : You may obtain a copy of the License at
14+ :
15+ : http://www.apache.org/licenses/LICENSE-2.0
16+ :
17+ : Unless required by applicable law or agreed to in writing, software
18+ : distributed under the License is distributed on an "AS IS" BASIS,
19+ : WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
20+ : See the License for the specific language governing permissions and
21+ : limitations under the License.
22+:)
23+
24+import module namespace http = "http://www.zorba-xquery.com/modules/http-client";
25+import module namespace map = "http://www.zorba-xquery.com/modules/store/data-structures/unordered-map";
26+import module namespace html = "http://www.zorba-xquery.com/modules/converters/html";
27+import module namespace parse-xml = "http://www.zorba-xquery.com/modules/xml";
28+import module namespace file = "http://expath.org/ns/file";
29+
30+declare namespace ann = "http://www.zorba-xquery.com/annotations";
31+declare namespace xhtml="http://www.w3.org/1999/xhtml";
32+declare namespace output="http://www.w3.org/2010/xslt-xquery-serialization";
33+declare namespace err="http://www.w3.org/2005/xqt-errors";
34+declare namespace httpsch = "http://expath.org/ns/http-client";
35+
36+declare variable $top-uri as xs:string := "http://www.zorba-xquery.com/html/index";
37+declare variable $uri-host as xs:string := "http://www.zorba-xquery.com";
38+
39+
40+
41+declare variable $local:processed-internal-links := xs:QName("processed-internal-links");
42+declare variable $local:processed-external-links := xs:QName("processed-external-links");
43+
44+
45+declare %ann:sequential function local:create-containers()
46+{
47+ map:create($local:processed-internal-links, xs:QName("xs:string"));
48+ map:create($local:processed-external-links, xs:QName("xs:string"));
49+};
50+
51+declare %ann:sequential function local:delete-containers(){
52+ for $x in map:available-maps()
53+ return map:delete($x);
54+};
55+
56+declare function local:is-internal($x as xs:string) as xs:boolean
57+{
58+ starts-with($x, $uri-host)
59+};
60+
61+declare function local:my-substring-before($s1 as xs:string, $s2 as xs:string) as xs:string
62+{
63+let $sb := fn:substring-before($s1, $s2)
64+return if($sb = "") then $s1 else $sb
65+};
66+
67+declare %ann:sequential function local:get-real-link($href as xs:string, $start-uri as xs:string) as xs:string?
68+{
69+ variable $absuri;
70+ try{
71+ $absuri := local:my-substring-before(resolve-uri(fn:normalize-space($href), $start-uri), "#");
72+ }
73+ catch *
74+ {
75+ map:insert($local:processed-external-links, (<FROM>{$start-uri}</FROM>,
76+ <MESSAGE>malformed</MESSAGE>,
77+ <RESULT>broken</RESULT>), $href);
78+ }
79+ $absuri
80+};
81+
82+
83+declare function local:get-media-type ($http-call as node()) as xs:string
84+{
85+ local:my-substring-before($http-call/httpsch:header[@name = 'Content-Type'][1]/string(@value), ";")
86+};
87+
88+declare function local:alive($http-call as item()*) as xs:boolean
89+{
90+ if((count($http-call) ge 1) and
91+ ($http-call[1]/@status eq 200))
92+ then true() else fn:trace(false(), "alive")
93+};
94+
95+
96+declare %ann:sequential function local:get-out-links-parsed($content as node()*, $uri as xs:string) as xs:string*
97+{ distinct-values( for $y in ($content//*:a/string(@href),
98+ $content//*:link/string(@href),
99+ $content//*:script/string(@src),
100+ $content//*:img/string(@src),
101+ $content//*:area/string(@href)
102+ )
103+return local:get-real-link($y, $uri))
104+};
105+
106+
107+declare %ann:sequential function local:get-out-links-unparsed($content as xs:string, $uri as xs:string) as xs:string*{
108+
109+ distinct-values(
110+ let $search := fn:analyze-string($content, "(&lt;|&amp;lt;|<)(((a|link|area).+?href)|((script|img).+?src))=([""'])(.*?)\7")
111+ for $other-uri2 in $search//group[@nr=8]/string()
112+ return local:get-real-link($other-uri2, $uri)
113+ )
114+};
115+
116+
117+declare %ann:sequential function local:map-insert-result($map-name as xs:QName, $url as xs:string, $http-result as item()*)
118+{
119+ if(count($http-result) ge 1)
120+ then
121+ map:insert($map-name, (<STATUS>{fn:string($http-result[1]/@status)}</STATUS>,
122+ <MESSAGE>{fn:string($http-result[1]/@message)}</MESSAGE>,
123+ <RESULT>{if(local:alive($http-result)) then "Ok" else "broken"}</RESULT>), $url)
124+ else map:insert($map-name, <RESULT>broken</RESULT>, $url)
125+};
126+
127+declare %ann:sequential function local:process-link($x as xs:string, $baseUri as xs:string, $n as xs:integer) as item()*{
128+ if(local:is-internal($x))
129+ then local:process-internal-link($x, $baseUri, $n);
130+ else local:process-external-link($x, $baseUri);
131+
132+};
133+
134+declare %ann:sequential function local:process-external-link($x as xs:string, $baseUri as xs:string){
135+ if(not(empty(map:get($local:processed-external-links, $x))))
136+ then exit returning false();
137+ else {}
138+ fn:trace($x, "HEAD external link");
139+ map:insert($local:processed-external-links, <FROM>{$baseUri}</FROM>, $x);
140+ variable $http-call:=();
141+ try{
142+ $http-call:=http:send-request(<httpsch:request method="HEAD" href="{$x}"/>, (), ());
143+ if((count($http-call) ge 1) and
144+ fn:not($http-call[1]/@status eq 200)) then
145+ $http-call:=http:send-request(<httpsch:request method="GET" href="{$x}"/>, (), ());
146+ else
147+ ();
148+ }
149+ catch * { }
150+ local:map-insert-result($local:processed-external-links, $x, $http-call);
151+};
152+
153+declare function local:tidy-options()
154+{<options xmlns="http://www.zorba-xquery.com/modules/converters/html-options" >
155+ <tidyParam name="output-xml" value="yes" />
156+ <tidyParam name="doctype" value="omit" />
157+ <tidyParam name="quote-nbsp" value="no" />
158+ <tidyParam name="char-encoding" value="utf8" />
159+ <tidyParam name="newline" value="LF" />
160+ <tidyParam name="tidy-mark" value="no" />
161+ <tidyParam name="new-inline-tags" value="nav header section article footer xqdoc:custom d c options json-param" />
162+ </options>
163+};
164+
165+
166+declare %ann:sequential function local:process-internal-link($x as xs:string, $baseUri as xs:string, $n as xs:integer){
167+ (: if($n=3) then exit returning (); else {} :)
168+ if(not(empty(map:get($local:processed-internal-links, $x))))
169+ then exit returning false();
170+ else {}
171+ fn:trace($x, "GET internal link");
172+ map:insert($local:processed-internal-links, <FROM>{$baseUri}</FROM>, $x);
173+ variable $http-call:=();
174+ try{
175+ $http-call:=http:send-request(<httpsch:request method="GET" href="{$x}"/>, (), ());
176+ }
177+ catch * { }
178+ if( not(local:alive($http-call)))
179+ then { local:map-insert-result($local:processed-internal-links, $x, $http-call); exit returning ();}
180+ else {}
181+
182+ if(not (local:get-media-type($http-call[1]) = "text/html"))
183+ then { local:map-insert-result($local:processed-internal-links, $x, $http-call); exit returning ();}
184+ else {}
185+ variable $string-content := string($http-call[2]);
186+ variable $content:=();
187+
188+ try{
189+ $content:=html:parse($string-content,local:tidy-options() );
190+ local:map-insert-result($local:processed-internal-links, $x, $http-call);
191+ }
192+ catch *
193+ {
194+ map:insert($local:processed-internal-links, (<MESSAGE>{concat("cannot tidy: ", $err:description)}</MESSAGE>,
195+ <RESULT>broken</RESULT>), $x);
196+ try{
197+ $content:=parse-xml:parse-xml-fragment ($string-content, "");
198+ }
199+ catch *
200+ { map:insert($local:processed-internal-links, <MESSAGE>{concat("cannot parse: ", $err:description)}</MESSAGE>, $x);}
201+ }
202+ variable $links :=();
203+ if(empty($content))
204+ then $links:=local:get-out-links-unparsed($string-content, fn:trace($x, "parse with regex, because tidy failed"));
205+ else $links:=local:get-out-links-parsed($content, $x);
206+ for $l in $links
207+ return local:process-link($l, $x, $n+1);
208+};
209+
210+
211+
212+
213+declare function local:print-results() as element()*
214+{
215+ for $x in map:keys($local:processed-internal-links)/map:attribute/@value/string()
216+ return <INTERNAL><LINK>{$x}</LINK>{map:get($local:processed-internal-links,$x)}</INTERNAL>,
217+ for $x in map:keys($local:processed-external-links)/map:attribute/@value/string()
218+ return <EXTERNAL><LINK>{$x}</LINK>{map:get($local:processed-external-links,$x)}</EXTERNAL>
219+};
220+
221+(:==========================================
222+===========================================:)
223+
224+variable $uri:= $top-uri;
225+
226+variable $result;
227+
228+local:create-containers();
229+local:process-link($uri, "", 1);
230+$result:=local:print-results() ;
231+
232+local:delete-containers();
233+
234+file:write(fn:resolve-uri("link_crawler_result.xml"),
235+ <result>{$result}</result>,
236+ <output:serialization-parameters>
237+ <output:indent value="yes"/>
238+ </output:serialization-parameters>)
239+
240+
241+\endcode
242+*/
243
244=== modified file 'doc/zorba/web_crawler.dox'
245--- doc/zorba/web_crawler.dox 2011-10-07 08:28:43 +0000
246+++ doc/zorba/web_crawler.dox 2012-02-28 21:06:18 +0000
247@@ -1,17 +1,23 @@
248 /**
249 \page web_crawler_tutorial Web Crawler example in XQuery
250
251-Description of a web crawler example in XQuery.
252+Description of a web crawler example in XQuery.<br/>
253+The entire script can be seen here:
254+\link link_crawler2
255+web crawler script
256+\endlink
257
258+<br/>
259 The idea is to crawl through the pages of a website and store a list with external pages and internal pages and check if they work or not.
260 This example uses Zorba's http module for accessing the webpages, and the html module for converting the html to xml.
261-The complete code can be found in the test directory of the html convertor module.
262+The complete code can be found in the test directory of the html converter module (link_crawler2.xq2).
263
264 \code
265 import module namespace http = "http://www.zorba-xquery.com/modules/http-client";
266 import module namespace map = "http://www.zorba-xquery.com/modules/store/data-structures/unordered-map";
267 import module namespace html = "http://www.zorba-xquery.com/modules/converters/html";
268 import module namespace parse-xml = "http://www.zorba-xquery.com/modules/xml";
269+import module namespace file = "http://expath.org/ns/file";
270 \endcode
271
272 The internal pages are checked recursively, while the external ones are only checked for existence.
273@@ -19,8 +25,8 @@
274 Change this variable to point to your website, or a subdirectory on your website.
275
276 \code
277-declare variable $top-uri as xs:string := "http://www.zorba-xquery.com/site2/html/index.html";
278-declare variable $uri-host as xs:string := "http://www.zorba-xquery.com/site2/";
279+declare variable $top-uri as xs:string := "http://www.zorba-xquery.com/html/index/";
280+declare variable $uri-host as xs:string := "http://www.zorba-xquery.com";
281
282 declare function local:is-internal($x as xs:string) as xs:boolean
283 {
284@@ -32,7 +38,7 @@
285 The crawling starts from the URI pointed by $top-uri.
286
287 Visited links are stored as nodes in two maps, one for internal pages and one for external pages.
288-The keys are the URIs, and the values are the strings "broken" or "clean".
289+The keys are the URIs, and the values are the strings "broken" or "clean", plus error codes if processing failed.
290 The maps are used to avoid parsing the same page twice.
291
292 \code
293@@ -55,10 +61,36 @@
294 After parsing an internal page with html module, all the links are extracted and parsed recursively, if they haven't been parsed.
295 The html module uses tidy library, so we use tidy options to setup for converting from html to xml.
296 Some html tags are marked to be ignored in new-inline-tags param, this being a particular case of this website.
297-You can add or remove tags to suit your website needs.
298+You can add or remove tags to suit your website needs.<br/>
299+The spaces in the url links are trimmed and normalized, and the fragment part is ignored.
300
301 \code
302-declare function local:get-out-links-parsed($content as node()*, $uri as xs:string) as xs:string*
303+declare variable $local:tidy-options := <options xmlns="http://www.zorba-xquery.com/modules/converters/html-options" >
304+ <tidyParam name="output-xml" value="yes" />
305+ <tidyParam name="doctype" value="omit" />
306+ <tidyParam name="quote-nbsp" value="no" />
307+ <tidyParam name="char-encoding" value="utf8" />
308+ <tidyParam name="newline" value="LF" />
309+ <tidyParam name="tidy-mark" value="no" />
310+ <tidyParam name="new-inline-tags" value="nav header section article footer xqdoc:custom d c options json-param" />
311+ </options>;
312+
313+declare %ann:sequential function local:get-real-link($href as xs:string, $start-uri as xs:string) as xs:string?
314+{
315+ variable $absuri;
316+ try{
317+ $absuri := local:my-substring-before(resolve-uri(fn:normalize-space($href), $start-uri), "#");
318+ }
319+ catch *
320+ {
321+ map:insert($local:processed-external-links, (<FROM>{$start-uri}</FROM>,
322+ <MESSAGE>malformed</MESSAGE>,
323+ <RESULT>broken</RESULT>), $href);
324+ }
325+ $absuri
326+};
327+
328+declare %ann:sequential function local:get-out-links-parsed($content as node()*, $uri as xs:string) as xs:string*
329 { distinct-values( for $y in ($content//*:a/string(@href),
330 $content//*:link/string(@href),
331 $content//*:script/string(@src),
332@@ -68,90 +100,128 @@
333 return local:get-real-link($y, $uri))
334 };
335
336-declare function local:tidy-options()
337-{<options xmlns="http://www.zorba-xquery.com/modules/converters/html-options" >
338- <tidyParam name="output-xml" value="yes" />
339- <tidyParam name="doctype" value="omit" />
340- <tidyParam name="quote-nbsp" value="no" />
341- <tidyParam name="char-encoding" value="utf8" />
342- <tidyParam name="newline" value="LF" />
343- <tidyParam name="tidy-mark" value="no" />
344- <tidyParam name="new-inline-tags" value="nav header section article footer xqdoc:custom d c options json-param" />
345- </options>
346+
347+declare %ann:sequential function local:map-insert-result($map-name as xs:QName, $url as xs:string, $http-result as item()*)
348+{
349+ if(count($http-result) ge 1)
350+ then
351+ map:insert($map-name, (<STATUS>{fn:string($http-result[1]/@status)}</STATUS>,
352+ <MESSAGE>{fn:string($http-result[1]/@message)}</MESSAGE>,
353+ <RESULT>{if(local:alive($http-result))
354+ then "Ok"
355+ else if(local:is-redirect($http-result))
356+ then "redirect"
357+ else "broken"
358+ }</RESULT>), $url);
359+ else map:insert($map-name, <RESULT>broken</RESULT>, $url);
360+ if(local:is-redirect($http-result)) then
361+ map:insert($map-name, <REDIRECT>{fn:string($http-result[1]/httpsch:header[@name = "Location"]/@value)}</REDIRECT>, $url);
362+ else {}
363 };
364
365-declare %ann:sequential function local:process-internal-link($x as xs:string, $n as xs:integer){
366- if($n=3) then exit returning (); else {}
367+declare %ann:sequential function local:process-internal-link($x as xs:string, $baseUri as xs:string, $n as xs:integer){
368 if(not(empty(map:get($local:processed-internal-links, $x))))
369 then exit returning false();
370 else {}
371+ fn:trace($x, "GET internal link");
372+ map:insert($local:processed-internal-links, <FROM>{$baseUri}</FROM>, $x);
373 variable $http-call:=();
374 try{
375- $http-call:=http:send-request(<httpsch:request method="GET" href="{$x}"/>, (), ());
376- }
377- catch * {}
378- if( not(local:alive($http-call)))
379- then { map:insert($local:processed-internal-links, "broken", $x); exit returning ();}
380- else {}
381- if(not (local:get-media-type($http-call[1]) = $supported-media-types))
382- then {map:insert($local:processed-internal-links, "clean", $x); exit returning ();}
383- else {}
384- variable $string-content := xs:string($http-call[2]);
385+ $http-call:=http:send-request(<httpsch:request method="GET" href="{$x}" follow-redirect="false"/>, (), ());
386+ }
387+ catch * { }
388+ if(local:is-redirect($http-call)) then
389+ {
390+ local:map-insert-result($local:processed-internal-links, $x, $http-call);
391+ try{
392+ $http-call:=http:send-request(<httpsch:request method="GET" href="{$x}"/>, (), ());
393+ }
394+ catch * { }
395+ }
396+ else {}
397+ if( not(local:alive($http-call)))
398+ then { local:map-insert-result($local:processed-internal-links, $x, $http-call); exit returning ();}
399+ else {}
400+
401+ if(not (local:get-media-type($http-call[1]) = "text/html"))
402+ then { local:map-insert-result($local:processed-internal-links, $x, $http-call); exit returning ();}
403+ else {}
404+ variable $string-content := string($http-call[2]);
405 variable $content:=();
406
407 try{
408- $content:=html:parse($string-content,local:tidy-options() );
409+ $content:=html:parse($string-content,$local:tidy-options );
410+ local:map-insert-result($local:processed-internal-links, $x, $http-call);
411 }
412 catch *
413- {
414- map:insert($local:processed-internal-links, concat("cannot tidy", $err:description), $x);
415+ {
416+ map:insert($local:processed-internal-links, (<MESSAGE>{concat("cannot tidy: ", $err:description)}</MESSAGE>,
417+ <RESULT>broken</RESULT>), $x);
418 try{
419 $content:=parse-xml:parse-xml-fragment ($string-content, "");
420 }
421 catch *
422- { map:insert($local:processed-internal-links, concat("cannot parse", $err:description), $x);}
423+ { map:insert($local:processed-internal-links, <MESSAGE>{concat("cannot parse: ", $err:description)}</MESSAGE>, $x);}
424 }
425 variable $links :=();
426 if(empty($content))
427- then $links:=local:get-out-links-unparsed($string-content, $x);
428+ then $links:=local:get-out-links-unparsed($string-content, fn:trace($x, "parse with regex, because tidy failed"));
429 else $links:=local:get-out-links-parsed($content, $x);
430 for $l in $links
431- return local:process-link($l, $n+1);
432+ return local:process-link($l, $x, $n+1);
433 };
434
435 \endcode
436
437+For each parsed link, we store the FROM, STATUS, MESSAGE and RESULT. The RESULT is "Ok" if everything went fine,
438+or "broken" if the page couldn't be retrieved or parsed, and in this case MESSAGE contains the error message.
439+The FROM element contains the parent url for that link.<br/>
440+<br/>
441 Some html pages have errors, and tidy library is very strict with checking errors.
442 When the parsing fails, we fallback to using regex for extracting the links.
443
444 \code
445-declare function local:get-out-links-unparsed($content as xs:string, $uri as xs:string) as xs:string*{
446+declare %ann:sequential function local:get-out-links-unparsed($content as xs:string, $uri as xs:string) as xs:string*{
447
448 distinct-values(
449 let $search := fn:analyze-string($content, "(&lt;|&amp;lt;|<)(((a|link|area).+?href)|((script|img).+?src))=([""'])(.*?)\7")
450 for $other-uri2 in $search//group[@nr=8]/string()
451- let $y:= fn:normalize-space($other-uri2)
452- return local:get-real-link($y, $uri)
453+ return local:get-real-link($other-uri2, $uri)
454 )
455 };
456
457 \endcode
458
459 For external links, we just check if they exist, so the http command requests only for HEAD.
460+Some websites return an error for HEAD; in this case we fall back to using GET.
461
462 \code
463-declare %ann:sequential function local:process-external-link($x as xs:string){
464+declare %ann:sequential function local:process-external-link($x as xs:string, $baseUri as xs:string){
465 if(not(empty(map:get($local:processed-external-links, $x))))
466 then exit returning false();
467 else {}
468- variable $http-call:=();
469+ fn:trace($x, "HEAD external link");
470+ map:insert($local:processed-external-links, <FROM>{$baseUri}</FROM>, $x);
471+ variable $http-call:=();
472 try{
473- $http-call:=http:send-request(<httpsch:request method="HEAD" href="{$x}"/>, (), ());
474+ $http-call:=http:send-request(<httpsch:request method="GET" href="{$x}"/>, (), ());
475+ if((count($http-call) ge 1) and
476+ fn:not($http-call[1]/@status eq 200)) then
477+ {
478+ if(local:is-redirect($http-call)) then
479+ {
480+ local:map-insert-result($local:processed-external-links, $x, $http-call);
481+ }
482+ else {}
483+ $http-call:=http:send-request(<httpsch:request method="GET" href="{$x}"/>, (), ());
484+ local:map-insert-result($local:processed-external-links, $x, $http-call);
485+ }
486+ else
487+ {}
488 }
489- catch * {}
490- if( local:alive($http-call))
491- then map:insert($local:processed-external-links, "clean", $x);
492- else map:insert($local:processed-external-links, "broken", $x);
493+ catch *
494+ { $http-call:=();}
495+ local:map-insert-result($local:processed-external-links, $x, $http-call);
496 };
497
498 \endcode
499@@ -170,4 +240,29 @@
500
501 \endcode
502
503-*/
504\ No newline at end of file
505+The main program calls the recursive function local:process-link for the $top-uri.
506+
507+\code
508+(:==========================================
509+===========================================:)
510+
511+variable $uri:= $top-uri;
512+
513+variable $result;
514+
515+local:create-containers();
516+local:process-link($uri, "", 1);
517+$result:=local:print-results() ;
518+
519+local:delete-containers();
520+
521+file:write(fn:resolve-uri("link_crawler_result.xml"),
522+ <result>{$result}</result>,
523+ <output:serialization-parameters>
524+ <output:indent value="yes"/>
525+ </output:serialization-parameters>)
526+
527+\endcode
528+
529+
530+*/
531
532=== removed file 'scripts/link_crawler.xq'
533--- scripts/link_crawler.xq 2011-08-18 20:07:20 +0000
534+++ scripts/link_crawler.xq 1970-01-01 00:00:00 +0000
535@@ -1,232 +0,0 @@
536-(:
537- : Copyright 2006-2011 The FLWOR Foundation.
538- :
539- : Licensed under the Apache License, Version 2.0 (the "License");
540- : you may not use this file except in compliance with the License.
541- : You may obtain a copy of the License at
542- :
543- : http://www.apache.org/licenses/LICENSE-2.0
544- :
545- : Unless required by applicable law or agreed to in writing, software
546- : distributed under the License is distributed on an "AS IS" BASIS,
547- : WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
548- : See the License for the specific language governing permissions and
549- : limitations under the License.
550-:)
551-
552-import module namespace http = "http://www.zorba-xquery.com/modules/http-client";
553-import module namespace file = "http://expath.org/ns/file";
554-import module namespace ddl = "http://www.zorba-xquery.com/modules/store/dynamic/collections/ddl";
555-import module namespace dml = "http://www.zorba-xquery.com/modules/store/dynamic/collections/dml";
556-import module namespace map = "http://www.zorba-xquery.com/modules/store/data-structures/unordered-map";
557-(:import module namespace lcc = "http://www.zorba-xquery.com/scripts/link-crawler-collections" at "link_crawler_collections.xq";:)
558-import module namespace tidy="http://www.zorba-xquery.com/modules/converters/html";
559-import schema namespace tidy-options="http://www.zorba-xquery.com/modules/converters/html-options";
560-import schema namespace httpsch = "http://expath.org/ns/http-client";
561-declare namespace ann = "http://www.zorba-xquery.com/annotations";
562-declare namespace xhtml="http://www.w3.org/1999/xhtml";
563-declare namespace output="http://www.w3.org/2010/xslt-xquery-serialization";
564-declare namespace err="http://www.w3.org/2005/xqt-errors";
565-
566-declare variable $result-file as xs:string external; (:PROJECT_SOURCE_DIR:)
567-
568-declare variable $top-uri as xs:string := "http://www.zorba-xquery.com/site2/html/index.html"; (: the start page :)
569-declare variable $uri-host as xs:string := "http://www.zorba-xquery.com/site2/"; (: what differentiates an internal uri :)
570-
571-declare variable $supported-media-types as xs:string+ := ("text/xml", "application/xml", "text/xml-external-parsed-entity", "application/xml-external-parsed-entity",
572- "application/atom+xml", "text/html");
573-
574-declare variable $internal-uris as xs:QName := xs:QName("internal-uris");
575-declare variable $external-uris as xs:QName := xs:QName("external-uris");
576-
577-declare variable $broken-internal as xs:QName := xs:QName("local:broken-internal");
578-declare variable $broken-external as xs:QName := xs:QName("local:broken-external");
579-declare variable $pages-cannot-parse as xs:QName := xs:QName("local:pages-cannot-parse");
580-
581-
582-declare function local:my-substring-before($s1 as xs:string, $s2 as xs:string) as xs:string
583-{
584- let $sb := fn:substring-before($s1, $s2)
585- return
586- if($sb = "") then
587- $s1
588- else
589- $sb
590-};
591-
592-
593-declare %ann:sequential function local:get-uris-from-page($uri as xs:string,
594- $reluri as xs:string,
595- $call-from as xs:string)
596-{
597- variable $method;
598- if(fn:starts-with($uri, $uri-host)) then
599- {
600- map:insert($internal-uris, $uri, $uri);
601- $method := "GET";
602- }
603- else
604- {
605- map:insert($external-uris, $uri, $uri);
606- $method := "HEAD";
607- }
608- fn:trace($uri, "");
609-
610- variable $load-result;
611- variable $content-string;
612- try{
613- $load-result := http:send-request(<httpsch:request method="{$method}" href="{$uri}"/>, (), ());
614- if($load-result[1]/@status eq 200) then
615- if(fn:starts-with($uri, $uri-host)) then
616- {
617- $content-string := string($load-result[2]);
618- let $media-type := local:my-substring-before($load-result[1]/httpsch:header[@name = 'Content-Type'][1]/fn:string(@value), ";")
619- return
620- if($media-type = "text/html") then
621- let $content := tidy:parse($content-string,
622- <options xmlns="http://www.zorba-xquery.com/modules/converters/html-options" >
623- <tidyParam name="output-xml" value="yes" />
624- <tidyParam name="doctype" value="omit" />
625- <tidyParam name="quote-nbsp" value="no" />
626- <tidyParam name="char-encoding" value="utf8" />
627- <tidyParam name="newline" value="LF" />
628- <tidyParam name="tidy-mark" value="no" />
629- <tidyParam name="new-inline-tags" value="nav header section article footer xqdoc:custom d c options json-param" />
630- </options>)
631- for $other-uri2 in ($content//*:a/string(@href),
632- $content//*:link/string(@href),
633- $content//*:script/string(@src),
634- $content//*:img/string(@src),
635- $content//*:area/string(@href)
636- )
637- let $other-uri := fn:normalize-space($other-uri2)
638- let $absuri := local:my-substring-before(fn:resolve-uri($other-uri, $uri), "#")
639- return
640- if(fn:not(fn:starts-with($other-uri, "#")) and
641- fn:empty(map:get($internal-uris, $absuri)) and
642- fn:empty(map:get($external-uris, $absuri))) then
643- local:get-uris-from-page($absuri, $other-uri, $uri);
644- else (:already followed this link:)
645- ();
646- else(: it's binary :)
647- fn:trace((" has binary content ", $media-type), "");
648- }
649- else(:success loading external link:)
650- ();
651- else (: broken link :)
652- if(fn:starts-with($uri, $uri-host)) then
653- {
654- dml:insert-nodes-last($broken-internal, <internal-broken-uri>
655- <uri>{$reluri}</uri>
656- <call-from>{$call-from}</call-from>
657- <media-type>{local:my-substring-before($load-result[1]/httpsch:header[@name = 'Content-Type'][1]/fn:string(@value), ";")}</media-type>
658- </internal-broken-uri>);
659- }
660- else
661- {
662- dml:insert-nodes-last($broken-external, <external-broken-uri>
663- <uri>{$uri}</uri>
664- <call-from>{$call-from}</call-from>
665- <media-type>{local:my-substring-before($load-result[1]/httpsch:header[@name = 'Content-Type'][1]/fn:string(@value), ";")}</media-type>
666- </external-broken-uri>);
667- }
668- }catch ZXQP0003
669- {
670- dml:insert-nodes-last($pages-cannot-parse, <page-cannot-parse>
671- <uri>{$uri}</uri>
672- <reluri>{$reluri}</reluri>
673- <call-from>{$call-from}</call-from>
674- <err-code>{$err:code}</err-code>
675- <err-description>{$err:description}</err-description>
676- <err-value>{$err:value}</err-value>
677- <err-module>{$err:module}</err-module>
678- <err-line>{$err:line-number}</err-line>
679- </page-cannot-parse>);
680- try{ (: tidy failed to parse the html, use regex:)
681- let $content := $content-string
682- let $search := fn:analyze-string($content, "(&lt;|&amp;lt;|<)(((a|link|area).+?href)|((script|img).+?src))=([""'])(.*?)\7")
683- for $other-uri2 in $search//fn:group[@nr=8]/fn:string()
684- let $other-uri := fn:normalize-space($other-uri2)
685- let $absuri := local:my-substring-before(fn:resolve-uri($other-uri, $uri), "#")
686- return
687- if(fn:not(fn:starts-with($other-uri, "#")) and
688- fn:empty(map:get($internal-uris, $absuri)) and
689- fn:empty(map:get($external-uris, $absuri))) then
690- local:get-uris-from-page($absuri, $other-uri, $uri);
691- else (:already followed this link:)
692- ();
693- }catch *
694- {
695- dml:insert-nodes-last($pages-cannot-parse, <page-cannot-parse>
696- <uri>{$uri}</uri>
697- <reluri>{$reluri}</reluri>
698- <call-from>{$call-from}</call-from>
699- <err-code>{$err:code}</err-code>
700- <err-description>{$err:description}</err-description>
701- <err-value>{$err:value}</err-value>
702- <err-module>{$err:module}</err-module>
703- <err-line>{$err:line-number}</err-line>
704- </page-cannot-parse>);
705- }
706- }catch *
707- {
708- dml:insert-nodes-last($pages-cannot-parse, <page-cannot-parse>
709- <uri>{$uri}</uri>
710- <reluri>{$reluri}</reluri>
711- <call-from>{$call-from}</call-from>
712- <err-code>{$err:code}</err-code>
713- <err-description>{$err:description}</err-description>
714- <err-value>{$err:value}</err-value>
715- <err-module>{$err:module}</err-module>
716- <err-line>{$err:line-number}</err-line>
717- </page-cannot-parse>);
718- }
719-};
720-
721-
722-
723-map:create($internal-uris, xs:QName("xs:string"));
724-ddl:create($broken-internal);
725-map:create($external-uris, xs:QName("xs:string"));
726-ddl:create($broken-external);
727-ddl:create($pages-cannot-parse);
728-
729-local:get-uris-from-page($top-uri, $top-uri, "");
730-
731-(:display results:)
732-let $full-report :=
733-<link-crawler website="{$top-uri}">
734-<internal-broken-uris>
735-{for $i in dml:collection($broken-internal)
736-let $u := $i/uri
737-group by $u
738-return $i}
739-</internal-broken-uris>
740-<pages-cannot-parse>
741-{for $e in dml:collection($pages-cannot-parse)
742-let $u := $e/uri
743-group by $u
744-return $e}
745-</pages-cannot-parse>
746-<external-uris>
747-{for $e in map:keys($external-uris)
748-let $v := $e/attribute/@value
749-group by $v
750-return <external-uri>{$e}</external-uri>}
751-</external-uris>
752-<external-broken-uris>
753-{for $e in dml:collection($broken-external)
754-let $u := $e/uri
755-group by $u
756-return $e}
757-</external-broken-uris>
758-
759-</link-crawler>
760-
761-return
762-file:write($result-file,
763- $full-report,
764- <output:serialization-parameters>
765- <output:indent value="yes"/>
766- </output:serialization-parameters>)
767-

Subscribers

People subscribed via source and target branches