Merge lp:~danielturcanu/zorba/web_crawler_tutorial into lp:zorba

Proposed by Daniel Turcanu
Status: Merged
Approved by: Chris Hillery
Approved revision: 10470
Merged at revision: 10489
Proposed branch: lp:~danielturcanu/zorba/web_crawler_tutorial
Merge into: lp:zorba
Diff against target: 197 lines (+181/-0)
2 files modified
doc/zorba/indexpage.dox.in (+8/-0)
doc/zorba/web_crawler.dox (+173/-0)
To merge this branch: bzr merge lp:~danielturcanu/zorba/web_crawler_tutorial
Reviewer            Review Type    Date Requested    Status
Chris Hillery                                        Approve
Sorin Marian Nasoi                                   Abstain
Review via email: mp+77179@code.launchpad.net

Commit message

Added a tutorial for the web crawler script from the html module (or the script directory in Zorba).

Description of the change

Added a tutorial for the web crawler script from the html module (or the script directory in Zorba).

Revision history for this message
Sorin Marian Nasoi (sorin.marian.nasoi) wrote :

The tutorial is nice, but I am not sure the index page in our Doxygen documentation is the best place to put it.

review: Abstain
Revision history for this message
Chris Hillery (ceejatec) wrote :

I like it. I'd leave the link from the index page there; having a specific section marked "Tutorials" may encourage folks to write more over time. If not, we can easily move it later.

review: Approve
Revision history for this message
Matthias Brantner (matthias-brantner) wrote :

I think the code in the tutorial should be literally included and tested as such, to make sure that we don't regress.

The tutorial should be linked from a blog entry. Also, the tutorial should provide a link to download the source code.

Daniel, could you please provide Dana with the HTML version of the tutorial? I'm sure she is also interested in reading it before it gets published.

Revision history for this message
Zorba Build Bot (zorba-buildbot) wrote :

Validation queue job web_crawler_tutorial-2011-10-04T23-35-02.03Z is finished. The final status was:

All tests succeeded!

Revision history for this message
Daniel Turcanu (danielturcanu) wrote :

The link crawler is added to the html module as a compilation test.

Preview Diff

=== modified file 'doc/zorba/indexpage.dox.in'
--- doc/zorba/indexpage.dox.in 2011-09-06 16:39:46 +0000
+++ doc/zorba/indexpage.dox.in 2011-09-27 15:05:56 +0000
@@ -127,6 +127,14 @@
 <!--li>\ref extensions_update</li-->


+</td></tr>
+<tr><td class="tdDocIndexTable">
+
+
+ <h2>Tutorials</h2>
+
+ \ref web_crawler_tutorial
+
 </td><tr>
 </table>


=== added file 'doc/zorba/web_crawler.dox'
--- doc/zorba/web_crawler.dox 1970-01-01 00:00:00 +0000
+++ doc/zorba/web_crawler.dox 2011-09-27 15:05:56 +0000
@@ -0,0 +1,173 @@
+/**
+\page web_crawler_tutorial Web Crawler example in XQuery
+
+Description of a web crawler example in XQuery.
+
+The idea is to crawl through the pages of a website, store a list of external and internal pages, and check whether they work.
+This example uses Zorba's http module to access the web pages, and the html module to convert the HTML to XML.
+The complete code can be found in the test directory of the html converter module.
+
+\code
+import module namespace http = "http://www.zorba-xquery.com/modules/http-client";
+import module namespace map = "http://www.zorba-xquery.com/modules/store/data-structures/unordered-map";
+import module namespace html = "http://www.zorba-xquery.com/modules/converters/html";
+import module namespace parse-xml = "http://www.zorba-xquery.com/modules/xml";
+\endcode
+
+The internal pages are checked recursively, while the external ones are only checked for existence.
+The distinction between internal and external links is made by comparing the URI with the global string variable $uri-host.
+Change this variable to point to your website, or to a subdirectory of your website.
+
+\code
+declare variable $top-uri as xs:string := "http://www.zorba-xquery.com/site2/html/index.html";
+declare variable $uri-host as xs:string := "http://www.zorba-xquery.com/site2/";
+
+declare function local:is-internal($x as xs:string) as xs:boolean
+{
+  starts-with($x, $uri-host)
+};
+
+\endcode
+
+The crawling starts from the URI pointed to by $top-uri.
+
+Visited links are stored as nodes in two maps, one for internal pages and one for external pages.
+The keys are the URIs, and the values are the strings "broken" or "clean".
+The maps are used to avoid parsing the same page twice.
+
+\code
+declare variable $local:processed-internal-links := xs:QName("processed-internal-links");
+declare variable $local:processed-external-links := xs:QName("processed-external-links");
+
+declare %ann:sequential function local:create-containers()
+{
+  map:create($local:processed-internal-links, xs:QName("xs:string"));
+  map:create($local:processed-external-links, xs:QName("xs:string"));
+};
+
+declare %ann:sequential function local:delete-containers()
+{
+  for $x in map:available-maps()
+  return map:delete($x);
+};
+
+\endcode
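The two-map bookkeeping above can be summarized outside XQuery. The following Python sketch is a hypothetical illustration (not part of the patch or the Zorba API): one store per link class, keyed by URI, with an insert-once guard that plays the same role as the map:get() check in the tutorial.

```python
# Hypothetical sketch of the tutorial's visited-link bookkeeping:
# one store per link class, keyed by URI, holding "clean" or "broken".
processed_internal_links = {}
processed_external_links = {}

def mark_once(store, uri, status):
    """Record a URI's status only on first sight, like the map:get() guard."""
    if uri in store:
        return False  # already processed; the caller should skip re-parsing
    store[uri] = status
    return True
```

A page is parsed only when the guard shows the URI is new, which is exactly how the XQuery version avoids crawling the same page twice.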
+
+After parsing an internal page with the html module, all the links are extracted and, if they haven't been processed yet, parsed recursively.
+The html module uses the tidy library, so we set tidy options for converting from HTML to XML.
+Some HTML tags are marked to be ignored in the new-inline-tags param; this is particular to this website.
+You can add or remove tags to suit your website's needs.
+
+\code
+declare function local:get-out-links-parsed($content as node()*, $uri as xs:string) as xs:string*
+{
+  distinct-values( for $y in ($content//*:a/string(@href),
+                              $content//*:link/string(@href),
+                              $content//*:script/string(@src),
+                              $content//*:img/string(@src),
+                              $content//*:area/string(@href)
+                             )
+  return local:get-real-link($y, $uri))
+};
+
+declare function local:tidy-options()
+{
+  <options xmlns="http://www.zorba-xquery.com/modules/converters/html-options" >
+    <tidyParam name="output-xml" value="yes" />
+    <tidyParam name="doctype" value="omit" />
+    <tidyParam name="quote-nbsp" value="no" />
+    <tidyParam name="char-encoding" value="utf8" />
+    <tidyParam name="newline" value="LF" />
+    <tidyParam name="tidy-mark" value="no" />
+    <tidyParam name="new-inline-tags" value="nav header section article footer xqdoc:custom d c options json-param" />
+  </options>
+};
+
+declare %ann:sequential function local:process-internal-link($x as xs:string, $n as xs:integer)
+{
+  if($n = 3) then exit returning (); else {}
+  if(not(empty(map:get($local:processed-internal-links, $x))))
+  then exit returning false();
+  else {}
+  variable $http-call := ();
+  try {
+    $http-call := http:send-request(<httpsch:request method="GET" href="{$x}"/>, (), ());
+  }
+  catch * {}
+  if(not(local:alive($http-call)))
+  then { map:insert($local:processed-internal-links, "broken", $x); exit returning (); }
+  else {}
+  if(not(local:get-media-type($http-call[1]) = $supported-media-types))
+  then { map:insert($local:processed-internal-links, "clean", $x); exit returning (); }
+  else {}
+  variable $string-content := xs:string($http-call[2]);
+  variable $content := ();
+
+  try {
+    $content := html:parse($string-content, local:tidy-options());
+  }
+  catch * {
+    map:insert($local:processed-internal-links, concat("cannot tidy: ", $err:description), $x);
+    try {
+      $content := parse-xml:parse-xml-fragment($string-content, "");
+    }
+    catch * {
+      map:insert($local:processed-internal-links, concat("cannot parse: ", $err:description), $x);
+    }
+  }
+  variable $links := ();
+  if(empty($content))
+  then $links := local:get-out-links-unparsed($string-content, $x);
+  else $links := local:get-out-links-parsed($content, $x);
+  for $l in $links
+  return local:process-link($l, $n+1);
+};
+
+\endcode
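The control flow of local:process-internal-link can be seen more plainly stripped of the HTTP and parsing details. This Python sketch is a simplified, hypothetical rendering (fetching and link extraction are injected as functions, and the internal/external dispatch done by local:process-link is omitted), showing only the depth cutoff, the visited guard, and the broken/clean bookkeeping.

```python
def process_internal(uri, depth, visited, fetch, extract, max_depth=3):
    """Depth-limited recursive crawl, mirroring the 'exit returning' guards."""
    if depth == max_depth or uri in visited:
        return                      # same role as the two early-exit checks
    page = fetch(uri)               # stand-in for http:send-request with GET
    if page is None:
        visited[uri] = "broken"     # request failed: record and stop here
        return
    visited[uri] = "clean"          # mark before recursing, to break cycles
    for link in extract(page):      # stand-in for get-out-links-parsed/unparsed
        process_internal(link, depth + 1, visited, fetch, extract, max_depth)
```

For example, with a fake two-page site where "c" is a dead link, crawling from "a" marks a and b clean and c broken; the visited check also stops the a/b cycle.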
+
+Some HTML pages have errors, and the tidy library is very strict in checking them.
+When the parsing fails, we fall back to using a regex for extracting the links.
+
+\code
+declare function local:get-out-links-unparsed($content as xs:string, $uri as xs:string) as xs:string*
+{
+  distinct-values(
+    let $search := fn:analyze-string($content, "(&lt;|&amp;lt;|<)(((a|link|area).+?href)|((script|img).+?src))=([""'])(.*?)\7")
+    for $other-uri2 in $search//group[@nr=8]/string()
+    let $y := fn:normalize-space($other-uri2)
+    return local:get-real-link($y, $uri)
+  )
+};
+
+\endcode
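The same fallback idea works with any ordinary regex engine. This Python sketch is a hypothetical, slightly simplified analogue of the analyze-string pattern above (the escaped-entity alternatives are dropped, so the group numbers shift): match href on a/link/area and src on script/img, capture the quoted value, and backreference the quote character.

```python
import re

# Simplified analogue of the tutorial's analyze-string pattern:
# group 6 is the quote character, group 7 the captured URL.
LINK_RE = re.compile(
    r"""<(((a|link|area).+?href)|((script|img).+?src))=(["'])(.*?)\6""",
    re.IGNORECASE | re.DOTALL,
)

def extract_links_unparsed(content):
    """Return the distinct link targets found in raw, possibly broken HTML."""
    return sorted({m.group(7) for m in LINK_RE.finditer(content)})
```

Because the pattern never requires well-formed markup, it still finds links in pages that tidy refuses to parse.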
+
+For external links, we just check whether they exist, so the http request asks only for the HEAD.
+
+\code
+declare %ann:sequential function local:process-external-link($x as xs:string)
+{
+  if(not(empty(map:get($local:processed-external-links, $x))))
+  then exit returning false();
+  else {}
+  variable $http-call := ();
+  try {
+    $http-call := http:send-request(<httpsch:request method="HEAD" href="{$x}"/>, (), ());
+  }
+  catch * {}
+  if(local:alive($http-call))
+  then map:insert($local:processed-external-links, "clean", $x);
+  else map:insert($local:processed-external-links, "broken", $x);
+};
+
+\endcode
+
+After parsing, the results are returned in XML format.
+
+\code
+declare function local:print-results() as element()*
+{
+  for $x in map:keys($local:processed-internal-links)/map:attribute/@value/string()
+  return <INTERNAL><LINK>{$x}</LINK><RESULT>{map:get($local:processed-internal-links, $x)}</RESULT></INTERNAL>,
+
+  for $x in map:keys($local:processed-external-links)/map:attribute/@value/string()
+  return <EXTERNAL><LINK>{$x}</LINK><RESULT>{map:get($local:processed-external-links, $x)}</RESULT></EXTERNAL>
+};
+
+\endcode
+
+*/
\ No newline at end of file
