Merge lp:~max-rabkin/ibid/google-translate into lp:~ibid-core/ibid/old-trunk-1.6

Proposed by Max Rabkin
Status: Merged
Approved by: Jonathan Hitchcock
Approved revision: not available
Merged at revision: 821
Proposed branch: lp:~max-rabkin/ibid/google-translate
Merge into: lp:~ibid-core/ibid/old-trunk-1.6
Diff against target: 182 lines (+59/-73)
1 file modified
ibid/plugins/google.py (+59/-73)
To merge this branch: bzr merge lp:~max-rabkin/ibid/google-translate
Reviewer | Review Type | Date Requested | Status
Jonathan Hitchcock | | | Approve
Michael Gorven | | | Approve
Stefano Rivera | | | Approve
Review via email: mp+16691@code.launchpad.net
To post a comment you must log in.
Revision history for this message
Max Rabkin (max-rabkin) wrote :

Using a hard-coded language list means we can tell users which languages we support, and we can use a normal @match to extract the arguments. It also means we use Google's language codes, which are not the same as ISO 639-1: they use "iw" for Hebrew instead of "he" (but accept both), and other functions in the Language API use the ISO 639-2 code for Cherokee (there is no 639-1 code). Also, we can read the languages right there in the code, so no more surprises like Greek.

Of course, there are the usual disadvantages of hard-coded values: basically, when Google adds languages, we'll have to update the list.

Revision history for this message
Stefano Rivera (stefanor) wrote :

I see a conflict in the diff below.

lp:~max-rabkin/ibid/google-translate updated
827. By Max Rabkin

merge

Revision history for this message
Max Rabkin (max-rabkin) wrote :

On Thu, Dec 31, 2009 at 7:58 PM, Stefano Rivera <email address hidden> wrote:
> I see a conflict in the diff below.

Thanks, fixed.

Revision history for this message
Stefano Rivera (stefanor) wrote :

Your merge request also seems to modify ibid.ini.

lp:~max-rabkin/ibid/google-translate updated
828. By Max Rabkin

undo changes to ibid.ini

Revision history for this message
Max Rabkin (max-rabkin) wrote :

> Your merge request also seems to modify ibid.ini

Gah, fixed.

Revision history for this message
Stefano Rivera (stefanor) wrote :

ibid/plugins/google.py:1: 'codecs' imported but unused
ibid/plugins/google.py:11: 'cacheable_download' imported but unused
ibid/plugins/google.py:13: 'get_html_parse_tree' imported but unused

Besides those, I approve.

review: Approve
Revision history for this message
Michael Gorven (mgorven) wrote :

Looks fine.
 review approve

review: Approve
Revision history for this message
Jonathan Hitchcock (vhata) :
review: Approve

Preview Diff

[H/L] Next/Prev Comment, [J/K] Next/Prev File, [N/P] Next/Prev Hunk
1=== modified file 'ibid/plugins/google.py'
2--- ibid/plugins/google.py 2009-12-30 21:21:20 +0000
3+++ ibid/plugins/google.py 2010-01-02 11:17:13 +0000
4@@ -9,6 +9,8 @@
5 from ibid.plugins import Processor, match
6 from ibid.config import Option, IntOption
7 from ibid.utils import decode_htmlentities, json_webservice, cacheable_download
8+from ibid.utils import human_join
9+from ibid.utils.html import get_html_parse_tree
10
11 help = {'google': u'Retrieves results from Google and Google Calculator.'}
12
13@@ -135,24 +137,66 @@
14
15 api_key = Option('api_key', 'Your Google API Key (optional)', None)
16 referer = Option('referer', 'The referer string to use (API searches)', default_referer)
17- dest_lang = Option('dest_lang', 'Destination language when none is specified', 'en')
18+ dest_lang = Option('dest_lang', 'Destination language when none is specified', 'english')
19
20 chain_length = IntOption('chain_length', 'Maximum length of translation chains', 10)
21
22- @match(r'^translate\s+(.*)$')
23- def translate (self, event, data):
24+ lang_names = {'afrikaans':'af', 'albanian':'sq', 'arabic':'ar',
25+ 'belarusian':'be', 'bulgarian':'bg', 'catalan':'ca',
26+ 'chinese':'zh', 'chinese simplified':'zh-cn',
27+ 'chinese traditional':'zh-tw', 'croatian':'hr', 'czech':'cs',
28+ 'danish':'da', 'dutch':'nl', 'english':'en', 'estonian':'et',
29+ 'filipino':'tl', 'finnish':'fi', 'french':'fr',
30+ 'galacian':'gl', 'german':'de', 'greek':'el', 'hebrew':'iw',
31+ 'hindi':'hi', 'hungarian':'hu', 'icelandic':'is',
32+ 'indonesian':'id', 'irish':'ga', 'italian':'it',
33+ 'japanese':'ja', 'korean': 'ko', 'latvian':'lv',
34+ 'lithuanian':'lt', 'macedonian':'mk', 'malay':'ms',
35+ 'maltese':'mt', 'norwegian':'no', 'persian':'fa',
36+ 'polish':'pl', 'portuguese':'pt', 'romanian':'ro',
37+ 'russian': 'ru', 'serbian':'sr', 'slovak':'sk',
38+ 'slovenian':'sl', 'spanish':'es', 'swahili':'sw',
39+ 'swedish':'sv', 'thai':'th', 'turkish':'tr', 'ukrainian':'uk',
40+ 'uzbek': 'uz', 'vietnamese':'vi', 'welsh':'cy',
41+ 'yiddish':'yi'}
42+
43+ alt_lang_names = {'simplified':'zh-CN', 'simplified chinese':'zh-CN',
44+ 'traditional':'zh-TW', 'traditional chinese':'zh-TW',
45+ 'bokmal':'no', 'norwegian bokmal':'no',
46+ u'bokm\N{LATIN SMALL LETTER A WITH RING ABOVE}l':'no',
47+ u'norwegian bokm\N{LATIN SMALL LETTER A WITH RING ABOVE}l':
48+ 'no',
49+ 'farsi':'fa'}
50+
51+ LANG_REGEX = '|'.join(lang_names.keys() + lang_names.values() +
52+ alt_lang_names.keys())
53+
54+ @match(r'^(?:translation\s*)?languages$')
55+ def languages (self, event):
56+ event.addresponse(human_join(sorted(self.lang_names.keys())))
57+
58+ @match(r'^translate\s+(.*?)(?:\s+from\s+(' + LANG_REGEX + r'))?'
59+ r'(?:\s+(?:in)?to\s+(' + LANG_REGEX + r'))?$')
60+ def translate (self, event, text, src_lang, dest_lang):
61+ dest_lang = self.language_code(dest_lang or self.dest_lang)
62+ src_lang = self.language_code(src_lang or '')
63+
64 try:
65- translated = self._translate(event, *self._parse_request(data))[0]
66+ translated = self._translate(event, text, src_lang, dest_lang)[0]
67 event.addresponse(translated)
68 except TranslationException, e:
69 event.addresponse(u"I couldn't translate that: %s.", unicode(e))
70
71- @match(r'^translation[-\s]*(?:chain|party)\s+(.*)$')
72- def translation_chain (self, event, data):
73+ @match(r'^translation[-\s]*(?:chain|party)\s+(.*?)'
74+ r'(?:\s+from\s+(' + LANG_REGEX + r'))?'
75+ r'(?:\s+(?:in)?to\s+(' + LANG_REGEX + r'))?$')
76+ def translation_chain (self, event, phrase, src_lang, dest_lang):
77 if self.chain_length < 1:
78 event.addresponse(u"I'm not allowed to play translation games.")
79 try:
80- phrase, src_lang, dest_lang = self._parse_request(data)
81+ dest_lang = self.language_code(dest_lang or self.dest_lang)
82+ src_lang = self.language_code(src_lang or '')
83+
84 chain = [phrase]
85 for i in range(self.chain_length):
86 phrase, src_lang = self._translate(event, phrase,
87@@ -167,38 +211,6 @@
88 except TranslationException, e:
89 event.addresponse(u"I couldn't translate that: %s.", unicode(e))
90
91- def _parse_request (self, data):
92- if not hasattr(self, 'lang_names'):
93- self._make_language_dict()
94-
95- from_re = r'\s+from\s+(?P<from>[-()\s\w]+?)'
96- to_re = r'\s+(?:in)?to\s+(?P<to>[-()\s\w]+?)'
97-
98- res = [(from_re, to_re), (to_re, from_re), (to_re,), (from_re,), ()]
99-
100- # Try all possible specifications of source and target language until we
101- # find a valid one.
102- for pat in res:
103- pat = '(?P<text>.*)' + ''.join(pat) + '\s*$'
104- m = re.match(pat, data, re.IGNORECASE | re.UNICODE | re.DOTALL)
105- if m:
106- dest_lang = m.groupdict().get('to')
107- src_lang = m.groupdict().get('from')
108- try:
109- if dest_lang:
110- dest_lang = self.language_code(dest_lang)
111- else:
112- dest_lang = self.dest_lang
113-
114- if src_lang:
115- src_lang = self.language_code(src_lang)
116- else:
117- src_lang = ''
118-
119- return (m.group('text'), src_lang, dest_lang)
120- except UnknownLanguageException:
121- continue
122-
123 def _translate (self, event, phrase, src_lang, dest_lang):
124 params = {
125 'v': '1.0',
126@@ -234,48 +246,22 @@
127
128 raise TranslationException(msg)
129
130- def _make_language_dict (self):
131- self.lang_names = d = {}
132-
133- filename = cacheable_download('http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt',
134- 'google/ISO-639-2_utf-8.txt')
135- f = codecs.open(filename, 'rU', 'utf-8')
136- for line in f:
137- code2B, code2T, code1, englishNames, frenchNames = line.split('|')
138-
139- # Identify languages by ISO 639-1 code if it exists; otherwise use
140- # ISO 639-2 (B). Google currently only translates languages with -1
141- # codes, but will may use -2 (B) codes in the future.
142- ident = code1 or code2B
143-
144- d[code2B] = d[code2T] = d[code1] = ident
145- for name in englishNames.lower().split(';'):
146- d[name] = ident
147-
148- del d['']
149-
150 def language_code (self, name):
151- """Convert a name to a language code.
152-
153- Caller must call _make_language_dict first."""
154+ """Convert a name to a language code."""
155
156 name = name.lower()
157
158- m = re.match('^([a-z]{2})(?:-[a-z]{2})?$', name)
159- if m and m.group(1) in self.lang_names:
160+ if name == '':
161 return name
162- if 'simplified' in name:
163- return 'zh-CN'
164- if 'traditional' in name:
165- return 'zh-TW'
166- if re.search(u'bokm[a\N{LATIN SMALL LETTER A WITH RING ABOVE}]l', name):
167- # what Google calls Norwegian seems to be Bokmal
168- return 'no'
169
170 try:
171- return self.lang_names[name]
172+ return self.lang_names.get(name) or self.alt_lang_names[name]
173 except KeyError:
174- raise UnknownLanguageException
175+ m = re.match('^([a-z]{2,3})(?:-[a-z]{2})?$', name)
176+ if m and m.group(1) in self.lang_names.values():
177+ return name
178+ else:
179+ raise UnknownLanguageException
180
181 # This Plugin uses code from youtube-dl
182 # Copyright (c) 2006-2008 Ricardo Garcia Gonzalez

Subscribers

People subscribed via source and target branches