Merge lp:~deeptik/linaro-ci/fix_bug_994573_lava_submission into lp:linaro-ci

Proposed by Deepti B. Kalakeri
Status: Merged
Approved by: James Tunnicliffe
Approved revision: 61
Merged at revision: 61
Proposed branch: lp:~deeptik/linaro-ci/fix_bug_994573_lava_submission
Merge into: lp:linaro-ci
Diff against target: 325 lines (+172/-59)
3 files modified
download_content_yes_to_lic.py (+166/-49)
download_file (+3/-9)
find_latest.py (+3/-1)
To merge this branch: bzr merge lp:~deeptik/linaro-ci/fix_bug_994573_lava_submission
Reviewer Review Type Date Requested Status
James Tunnicliffe (community) Approve
Review via email: mp+104737@code.launchpad.net

Description of the change

Fix the CI LAVA submissions that were failing for the origen and snowball jobs.
Also, align with the latest changes for license acceptance and downloading of restricted files.

Revision history for this message
James Tunnicliffe (dooferlad) wrote :

Looks fine. Not critical but I may change this section a little:

=== modified file 'download_file'
--- download_file 2012-01-09 07:18:32 +0000
+++ download_file 2012-05-04 13:33:19 +0000

@@ -18,17 +18,11 @@
 args = parser.parse_args()

 fetcher = LicenseProtectedFileFetcher()
-content = fetcher.get(args.url[0])

 # Get file name from URL
 file_name = os.path.basename(urlparse.urlparse(args.url[0]).path)
-
-# If file name can not be found (for example, we have got a directory
-# index), provide a default.
 if not file_name:
-    file_name = "unnamed.out"
+    file_name = "downloaded"

-out = open(file_name, 'w')
-out.write(content)
-out.close()
+fetcher.get(args.url[0], file_name)
 fetcher.close()

Since the script is called download_file and it would appear that you aren't using the default file name at all (it used to be unnamed.out and is now downloaded; I didn't see any other references to that change), I would probably do this:

if not file_name:
    print >> sys.stderr, "Could not derive file name from URL - aborting"
    exit(1)

You will need to import sys.

It looks like using that script to download something that isn't an explicitly named in the URL file is an error, so you might as well catch it early.
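
Roughly, the tail of the script would then look like this (an untested
sketch; it assumes the script's existing imports plus the new "import sys"):

    # Get file name from URL
    file_name = os.path.basename(urlparse.urlparse(args.url[0]).path)
    if not file_name:
        # The URL ended in a slash (a directory index, say), so there is
        # nothing sensible to name the file - fail loudly and early.
        print >> sys.stderr, "Could not derive file name from URL - aborting"
        sys.exit(1)

    fetcher.get(args.url[0], file_name)
    fetcher.close()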

Other than that, looks good.

Just FYI (because my comments weren't updated everywhere), LicenseProtectedFileFetcher.get() will only return the first 1MB of a file. It always stores the full file to disk if file_name is set. I can't see you using it to download large files and expecting them to be returned in this way, so it looks safe, but I thought it was worth drawing your attention to the change just in case.

I would like to update it to only keep files in RAM if file_name isn't set, and to throw an exception if the file is too large. Of course, we may not have this script for much longer, so there is little point in me changing it until we know its future. Just thought I would keep you in the loop :-)
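
If I do get around to that, it would be something along these lines (a
sketch of the idea only, with a made-up MAX_IN_RAM constant; this is not
code in this branch):

    MAX_IN_RAM = 1024 * 1024  # 1MB cap for bodies kept in memory

    def _write_body(self, buf):
        """Used by curl as a sink for body content"""
        if self.file_out:
            # Streaming to disk - no need to hold the body in RAM as well.
            self.file_out.write(buf)
        elif len(self.body) + len(buf) > MAX_IN_RAM:
            # No target file and the body has outgrown the RAM budget.
            raise RuntimeError("File too large to hold in RAM; "
                               "pass file_name to stream it to disk")
        else:
            self.body += buf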

review: Approve
Revision history for this message
Deepti B. Kalakeri (deeptik) wrote :

On Fri, May 4, 2012 at 7:36 PM, James Tunnicliffe
<email address hidden> wrote:

> Review: Approve
>
> Looks fine. Not critical but I may change this section a little:
>
> === modified file 'download_file'
> --- download_file 2012-01-09 07:18:32 +0000
> +++ download_file 2012-05-04 13:33:19 +0000
>
> @@ -18,17 +18,11 @@
> args = parser.parse_args()
>
> fetcher = LicenseProtectedFileFetcher()
> -content = fetcher.get(args.url[0])
>
> # Get file name from URL
> file_name = os.path.basename(urlparse.urlparse(args.url[0]).path)
> -
> -# If file name can not be found (for example, we have got a directory
> -# index), provide a default.
> if not file_name:
> -    file_name = "unnamed.out"
> +    file_name = "downloaded"
>
> -out = open(file_name, 'w')
> -out.write(content)
> -out.close()
> +fetcher.get(args.url[0], file_name)
> fetcher.close()
>
> Since the script is called download_file and it would appear that you
> aren't using the default file name at all (it used to be unnamed.out, now
> is downloaded, didn't see any more references to that change), I would
> probably do this:
>
> if not file_name:
>     print >> sys.stderr, "Could not derive file name from URL - aborting"
>     exit(1)
>
The file_name could be used later in the build scripts, even if not in
download_file, so I would retain it. I will make the changes to use it
there in another MP and send that for review.

> You will need to import sys.
>
> It looks like using that script to download something that isn't an
> explicitly named in the URL file is an error, so you might as well catch it
> early.
>
I did not get this completely. Can you please elaborate?

> Other than that, looks good.
>
> Just FYI (because my comments weren't updated everywhere),
> LicenseProtectedFileFetcher.get() will only return the first 1MB of a file.
> It always stores the full file to disk if file_name is set. I can't see you
> using it to download large files and expecting them to be returned in this
> way, so it looks safe, but thought it was worth drawing your attention to
> the change just in case. I would like to update it to only keep files in
> RAM if file_name isn't set and throw an exception if the file is too large.
> Of course, we may not have this script for much longer, so there is little
> point in me changing it until we know its future. Just thought I would keep
> you in the loop :-)
> --
>
> https://code.launchpad.net/~deeptik/linaro-ci/fix_bug_994573_lava_submission/+merge/104737
> You are the owner of lp:~deeptik/linaro-ci/fix_bug_994573_lava_submission.
>

--
Thanks and Regards,
Deepti
Infrastructure Team Member, Linaro Platform Teams
Linaro.org | Open source software for ARM SoCs
Follow Linaro: http://www.facebook.com/pages/Linaro
http://twitter.com/#!/linaroorg - http://www.linaro.org/linaro-blog

Revision history for this message
James Tunnicliffe (dooferlad) wrote :

>> It looks like using that script to download something that isn't an
>> explicitly named in the URL file is an error, so you might as well catch it
>> early.
>>
> I did not get this completely. Can you please elaborate?

Sorry, didn't manage to use English at that point! You seem to be
using the script to download a file that comes from a URL like
http://server/dir/file. In this case we get the file name from the URL
and save to a file with that name. The potential problem is the
download script will download what a server returns when there is a
trailing slash on the end of a URL, such as a directory listing from
http://server/dir/. In this case it won't know what to name the file,
so it uses a default file name. You don't seem to be using this
default file name, so you could use your knowledge of how you intend
to use the script to modify it to print a helpful error message and
quit in this case (I am assuming that it would be easier to debug this
than downloading to the default file name and failing later).
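
For example (Python 2 urlparse, as the script uses):

    import os
    import urlparse

    # A URL that names a file gives us something usable...
    print os.path.basename(urlparse.urlparse("http://server/dir/file").path)
    # -> "file"

    # ...but a trailing slash (a directory index) leaves nothing to use.
    print os.path.basename(urlparse.urlparse("http://server/dir/").path)
    # -> ""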

Hope that is clear!

--
James Tunnicliffe

Preview Diff

=== modified file 'download_content_yes_to_lic.py'
--- download_content_yes_to_lic.py 2012-01-09 07:18:32 +0000
+++ download_content_yes_to_lic.py 2012-05-04 13:33:19 +0000
@@ -1,11 +1,14 @@
+# Changes required to address EULA for the origen hwpacks
+
 #!/usr/bin/env python
 
-# Changes required to address EULA for the origen hwpacks
-
+import argparse
 import os
 import pycurl
 import re
 import urlparse
+import html2text
+from BeautifulSoup import BeautifulSoup
 
 class LicenseProtectedFileFetcher:
     """Fetch a file from the web that may be protected by a license redirect
@@ -27,25 +30,104 @@
     downloads.
 
     """
-    def __init__(self):
+    def __init__(self, cookie_file="cookies.txt"):
         """Set up cURL"""
         self.curl = pycurl.Curl()
-        self.curl.setopt(pycurl.FOLLOWLOCATION, 1)
         self.curl.setopt(pycurl.WRITEFUNCTION, self._write_body)
         self.curl.setopt(pycurl.HEADERFUNCTION, self._write_header)
-        self.curl.setopt(pycurl.COOKIEFILE, "cookies.txt")
-        self.curl.setopt(pycurl.COOKIEJAR, "cookies.txt")
+        self.curl.setopt(pycurl.FOLLOWLOCATION, 1)
+        self.curl.setopt(pycurl.COOKIEFILE, cookie_file)
+        self.curl.setopt(pycurl.COOKIEJAR, cookie_file)
+        self.file_out = None
 
     def _get(self, url):
         """Clear out header and body storage, fetch URL, filling them in."""
-        self.curl.setopt(pycurl.URL, url)
-
-        self.body = ""
-        self.header = ""
-
-        self.curl.perform()
-
-    def get(self, url):
+        url = url.encode("ascii")
+        self.curl.setopt(pycurl.URL, url)
+
+        self.body = ""
+        self.header = ""
+
+        if self.file_name:
+            self.file_out = open(self.file_name, 'w')
+        else:
+            self.file_out = None
+
+        self.curl.perform()
+        self._parse_headers(url)
+
+        if self.file_out:
+            self.file_out.close()
+
+    def _parse_headers(self, url):
+        header = {}
+        for line in self.header.splitlines():
+            # Header lines typically are of the form thing: value...
+            test_line = re.search("^(.*?)\s*:\s*(.*)$", line)
+
+            if test_line:
+                header[test_line.group(1)] = test_line.group(2)
+
+        # The location attribute is sometimes relative, but we would
+        # like to have it as always absolute...
+
+        if 'Location' in header.keys():
+            parsed_location = urlparse.urlparse(header["Location"])
+
+            # If not an absolute location...
+            if not parsed_location.netloc:
+                parsed_source_url = urlparse.urlparse(url)
+                new_location = ["", "", "", "", ""]
+
+                new_location[0] = parsed_source_url.scheme
+                new_location[1] = parsed_source_url.netloc
+                new_location[2] = header["Location"]
+
+                # Update location with absolute URL
+                header["Location"] = urlparse.urlunsplit(new_location)
+
+        self.header_text = self.header
+        self.header = header
+
+    def get_headers(self, url):
+        url = url.encode("ascii")
+        self.curl.setopt(pycurl.URL, url)
+
+        self.body = ""
+        self.header = ""
+
+        # Setting NOBODY causes CURL to just fetch the header.
+        self.curl.setopt(pycurl.NOBODY, True)
+        self.curl.perform()
+        self.curl.setopt(pycurl.NOBODY, False)
+
+        self._parse_headers(url)
+
+        return self.header
+
+    def get_or_return_license(self, url, file_name=None):
+        """Get file at the requested URL or, if behind a license, return that.
+
+        If the URL provided does not redirect us to a license, then return the
+        body of that file. If we are redirected to a license click through
+        then return (the license as plain text, url to accept the license).
+
+        If the user of this function accepts the license, then they should
+        call get_protected_file."""
+
+        self.file_name = file_name
+
+        # Get the license details. If this returns None, the file isn't license
+        # protected and we can just return the file we started to get in the
+        # function (self.body).
+        license_details = self._get_license(url)
+
+        if license_details:
+            return license_details
+
+        return self.body
+
+    def get(self, url, file_name=None):
         """Fetch the requested URL, accepting licenses, returns file body
 
         Fetches the file at url. If a redirect is encountered, it is
@@ -53,13 +135,34 @@
         then download the original file.
 
         """
-        self._get(url)
-
-        location = self._get_location()
-        if location:
-            # Off to the races - we have been redirected.
-            # Expect to find a link to self.location with -accepted inserted
-            # before the .html, i.e. ste.html -> ste-accepted.html
+
+        self.file_name = file_name
+        license_details = self._get_license(url)
+
+        if license_details:
+            # Found a license. Accept the license without looking at it and
+            # start fetching the file we originally wanted.
+            accept_url = license_details[1]
+            self.get_protected_file(accept_url, url)
+
+        else:
+            # If we got here, there wasn't a license protecting the file
+            # so we just fetch it.
+            self._get(url)
+
+        return self.body
+
+    def _get_license(self, url):
+        """Return (license, accept URL) if found, else return None"""
+
+        self.get_headers(url)
+
+        if "Location" in self.header and self.header["Location"] != url:
+            # We have been redirected to a new location - the license file
+            location = self.header["Location"]
+
+            # Fetch the license HTML
+            self._get(location)
 
             # Get the file from the URL (full path)
             file = urlparse.urlparse(location).path
@@ -68,50 +171,64 @@
             file = os.path.split(file)[-1]
 
             # Look for a link with accepted.html in the page name. Follow it.
-            new_file = None
             for line in self.body.splitlines():
                 link_search = re.search("""href=.*?["'](.*?-accepted.html)""",
                                         line)
                 if link_search:
                     # Have found license accept URL!
                     new_file = link_search.group(1)
-
-            if new_file:
-                # Accept the license...
-                accept_url = re.sub(file, new_file, location)
-                self._get(accept_url)
-
-                # The above get *should* take us to the file requested via
-                # a redirect. If we manually need to follow that redirect,
-                # do that now.
-
-                if self._get_location():
-                    # If we haven't been redirected to our original file,
-                    # we should be able to just download it now.
-                    self._get(url)
-
-        return self.body
+                    accept_url = re.sub(file, new_file, location)
+
+                    # Parse the HTML using BeautifulSoup
+                    soup = BeautifulSoup(self.body)
+
+                    # The license is in a div with the ID license-text, so we
+                    # use this to pull just the license out of the HTML.
+                    html_license = u""
+                    for chunk in soup.findAll(id="license-text"):
+                        # Output of chunk.prettify is UTF8, but comes back
+                        # as a str, so convert it here.
+                        html_license += chunk.prettify().decode("utf-8")
+
+                    text_license = html2text.html2text(html_license)
+
+                    return text_license, accept_url
 
-    def _search_header(self, field):
-        """Search header for the supplied field, return field / None"""
-        for line in self.header.splitlines():
-            search = re.search(field + ":\s+(.*?)$", line)
-            if search:
-                return search.group(1)
         return None
 
-    def _get_location(self):
-        """Return content of Location field in header / None"""
-        return self._search_header("Location")
+    def get_protected_file(self, accept_url, url):
+        """Gets the file redirected to by the accept_url"""
+
+        self._get(accept_url)  # Accept the license
+
+        if not("Location" in self.header and self.header["Location"] == url):
+            # If we got here, we don't have the file yet (weren't redirected
+            # to it). Fetch our target file. This should work now that we have
+            # the right cookie.
+            self._get(url)  # Download the target file
+
+        return self.body
 
     def _write_body(self, buf):
         """Used by curl as a sink for body content"""
-        self.body += buf
+
+        # If we have a target file to write to, write to it
+        if self.file_out:
+            self.file_out.write(buf)
+
+        # Only buffer first 1MB of body. This should be plenty for anything
+        # we wish to parse internally.
+        if len(self.body) < 1024*1024*1024:
+            self.body += buf
 
     def _write_header(self, buf):
         """Used by curl as a sink for header content"""
         self.header += buf
 
+    def register_progress_callback(self, callback):
+        self.curl.setopt(pycurl.NOPROGRESS, 0)
+        self.curl.setopt(pycurl.PROGRESSFUNCTION, callback)
+
     def close(self):
         """Wrapper to close curl - this will allow curl to write out cookies"""
         self.curl.close()

=== modified file 'download_file'
--- download_file 2012-01-09 07:18:32 +0000
+++ download_file 2012-05-04 13:33:19 +0000
@@ -8,7 +8,7 @@
 import urlparse
 import os
 
-#Download file specified on command line
+"""Download file specified on command line"""
 parser = argparse.ArgumentParser(description="Download a file, accepting "
     "any licenses required to do so.")
 
@@ -18,17 +18,11 @@
 args = parser.parse_args()
 
 fetcher = LicenseProtectedFileFetcher()
-content = fetcher.get(args.url[0])
 
 # Get file name from URL
 file_name = os.path.basename(urlparse.urlparse(args.url[0]).path)
-
-# If file name can not be found (for example, we have got a directory
-# index), provide a default.
 if not file_name:
-    file_name = "unnamed.out"
+    file_name = "downloaded"
 
-out = open(file_name, 'w')
-out.write(content)
-out.close()
+fetcher.get(args.url[0], file_name)
 fetcher.close()

=== modified file 'find_latest.py'
--- find_latest.py 2012-04-30 07:40:00 +0000
+++ find_latest.py 2012-05-04 13:33:19 +0000
@@ -127,7 +127,8 @@
     :param url: The base url to search
     :param extra: The extra path needed to complete the url
     """
-    builddates = geturl(url)
+    fetcher = LicenseProtectedFileFetcher()
+    builddates = fetcher.get(url)
     dates = find_ci_builds(builddates)
     dates = sorted(dates, key=lambda x: x[1])
 
@@ -139,4 +140,5 @@
             raise StopIteration()
         except StopIteration:
             pass
+    fetcher.close()
     return filename
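
For reference, the reworked fetcher above is driven roughly like this (an
illustrative sketch only; the import path and URL here are assumptions,
not part of the merge):

    from download_content_yes_to_lic import LicenseProtectedFileFetcher

    url = "http://server/dir/file"  # hypothetical download URL
    fetcher = LicenseProtectedFileFetcher()

    # One-shot: accept any license redirect and stream the file to disk.
    fetcher.get(url, "file")

    # Two-step: surface the license text to the caller before accepting.
    result = fetcher.get_or_return_license(url, "file")
    if isinstance(result, tuple):
        license_text, accept_url = result
        # ...show license_text to the user, then, on acceptance:
        fetcher.get_protected_file(accept_url, url)

    fetcher.close()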
