Merge lp:~xnox/ubuntu-geonames/fix-duplicates into lp:ubuntu-geonames

Proposed by Dimitri John Ledkov
Status: Merged
Merged at revision: 17
Proposed branch: lp:~xnox/ubuntu-geonames/fix-duplicates
Merge into: lp:ubuntu-geonames
Diff against target: 166 lines (+37/-33)
4 files modified
README (+7/-7)
geoname-modpython.py (+15/-8)
import-geonames.sh (+14/-17)
sphinx.conf (+1/-1)
To merge this branch: bzr merge lp:~xnox/ubuntu-geonames/fix-duplicates
Reviewer Review Type Date Requested Status
Evan (community) Approve
Review via email: mp+120911@code.launchpad.net

Description of the change

* This fixes database import due to data format changes
* Also fixes documentation and sample config
* This updates geoname-modpython.py query
* This does not _fix_ cherrypy version geoname.py (as it appears to not be used / broken ?!)

Instead of doing sql query for each potential location, we build the two location lists (for normal and alternative names) and execute a single query across both tables with a UNION. As a side-effect, UNION removes duplicate entries. Thus instead of 20 New Yorks, we get only 4.

If you are deploying this, please note:
- Due to database changes it's probably best to drop the whole database, recreate it & re-import the dataset.
- And don't forget the re-index with sphinx.

To post a comment you must log in.
Revision history for this message
Evan (ev) wrote :

Yes, the cherrypy version was the initially developed one, which was dropped for the mod_python variant.

Revision history for this message
Evan (ev) wrote :

Looks good! Please file an RT for deployment.

review: Approve

Preview Diff

[H/L] Next/Prev Comment, [J/K] Next/Prev File, [N/P] Next/Prev Hunk
1=== modified file 'README'
2--- README 2010-09-03 07:35:55 +0000
3+++ README 2012-08-23 04:40:25 +0000
4@@ -1,12 +1,12 @@
5 - Install postgres and create a 'geonames' database.
6-- Download Geonames from http://download.geonames.org/export/dump
7+- Create a geouser account with password geopw
8 - Run import-geonames.sh
9-- Create a geouser account with password geopw, grant permissions on the table:
10- GRANT ALL PRIVILEGES ON geoname, admin1codes, countryInfo TO geouser
11- (slightly excessive, it just needs to query)
12-- Install and configure Sphinx (/etc/sphinxsearch/sphinx.conf ) from universe.
13-- Create an index on geonameid which should reduce query times:
14- CREATE INDEX geoname_id_idx ON geoname(geonameid);
15+ + it downloads & populates the database
16+ + grants slightly excessive permissions to geouser for queries
17+- Install sphinxsearch and configure (/etc/sphinxsearch/sphinx.conf ) from universe.
18+ + run $ sudo indexer geonames
19+ + Enable /etc/default/sphinxsearch
20+ + start $ sudo service sphinxsearch start
21 - Install sphinxapi-python from ppa:~ev and use or modify the sphinx.conf
22 provided.
23 - Install CherryPy from universe.
24
25=== modified file 'geoname-modpython.py'
26--- geoname-modpython.py 2011-04-06 20:33:16 +0000
27+++ geoname-modpython.py 2012-08-23 04:40:25 +0000
28@@ -5,11 +5,12 @@
29 statement = "SELECT geoname.name, admin1codes.name, countryInfo.name, \
30 geoname.longitude, geoname.latitude FROM admin1codes, geoname, countryInfo \
31 WHERE code = geoname.country||'.'||geoname.admin1 AND \
32-countryInfo.iso_alpha2=geoname.country AND geoname.geonameid=%s;"
33-altstatement = "SELECT alternatename.alternateName, admin1codes.name, countryInfo.name, \
34+countryInfo.iso_alpha2=geoname.country AND geoname.geonameid in %s \
35+UNION \
36+SELECT alternatename.alternateName, admin1codes.name, countryInfo.name, \
37 geoname.longitude, geoname.latitude FROM alternatename, admin1codes, geoname, countryInfo \
38 WHERE code = geoname.country||'.'||geoname.admin1 AND \
39-countryInfo.iso_alpha2=geoname.country AND alternatename.geonameid=geoname.geonameid AND alternatename.alternatenameId=%s;"
40+countryInfo.iso_alpha2=geoname.country AND alternatename.geonameid=geoname.geonameid AND alternatename.alternatenameId in %s;"
41 authstring = 'dbname=geonames user=geouser password=geopw host=localhost'
42 jsonheader = '['
43 jsonfooter = ']'
44@@ -31,17 +32,23 @@
45 connection = psycopg2.connect(authstring)
46 cursor = connection.cursor()
47 try:
48+ # We need at least one value for the sql in operator
49+ # and there are no locations with id 0
50+ statement_ids = [0]
51+ altstatement_ids = [0]
52 for x in result:
53 rawid = x['id']
54 idval = rawid / 10
55 idtype = rawid % 10
56 if idtype == 1:
57- cursor.execute(statement % idval)
58+ statement_ids.append(idval)
59 else:
60- cursor.execute(altstatement % idval)
61- record = cursor.fetchone()
62- if record:
63- ret.append(jsonentry % record)
64+ altstatement_ids.append(idval)
65+
66+ cursor.execute(statement, (tuple(statement_ids), tuple(altstatement_ids)))
67+ records = cursor.fetchall()
68+ for record in records:
69+ ret.append(jsonentry % record)
70 finally:
71 cursor.close()
72 connection.close()
73
74=== modified file 'import-geonames.sh'
75--- import-geonames.sh 2011-04-07 15:10:15 +0000
76+++ import-geonames.sh 2012-08-23 04:40:25 +0000
77@@ -9,7 +9,7 @@
78 # allCountries.zip contains allCountries.txt
79 # alternateNames.zip contains iso-languagecodes.txt alternateNames.txt
80 ZIPFILES="allCountries.zip alternateNames.zip"
81-TXTFILES="admin1Codes.txt admin1CodesASCII.txt countryInfo.txt timeZones.txt"
82+TXTFILES="admin1CodesASCII.txt countryInfo.txt timeZones.txt"
83 for i in $ZIPFILES $TXTFILES
84 do
85 wget "http://download.geonames.org/export/dump/$i"
86@@ -24,14 +24,14 @@
87 grep -v '^#' countryInfo.txt > countryInfo.txt.tmp
88 tail -n +2 timeZones.txt > timeZones.txt.tmp
89
90-sudo -u postgres psql geonames <<EOT
91+psql geonames <<EOT
92 BEGIN;
93 DROP TABLE IF EXISTS geoname;
94 CREATE TABLE geoname (
95 geonameid int,
96 name varchar(200),
97 asciiname varchar(200),
98- alternatenames varchar(6000),
99+ alternatenames varchar(10000),
100 latitude float,
101 longitude float,
102 fclass char(1),
103@@ -57,9 +57,11 @@
104 isoLanguage varchar(7),
105 alternateName varchar(200),
106 isPreferredName boolean,
107- isShortName boolean
108+ isShortName boolean,
109+ isColloquial boolean,
110+ isHistoric boolean
111 );
112-copy alternatename (alternatenameid,geonameid,isoLanguage,alternateName,isPreferredName,isShortName) from '$WORKPATH/alternateNames.txt' null as '';
113+copy alternatename (alternatenameid,geonameid,isoLanguage,alternateName,isPreferredName,isShortName,isColloquial,isHistoric) from '$WORKPATH/alternateNames.txt' null as '';
114
115 DROP TABLE IF EXISTS countryinfo;
116 CREATE TABLE countryinfo (
117@@ -94,29 +96,24 @@
118 );
119 copy iso_languagecodes (iso_639_3, iso_639_2, iso_639_1, language_name) from '$WORKPATH/iso-languagecodes.txt.tmp' null as '';
120
121-DROP TABLE IF EXISTS admin1Codes;
122-CREATE TABLE admin1Codes (
123- code varchar(10),
124- name TEXT
125-);
126-copy admin1Codes (code, name) from '$WORKPATH/admin1Codes.txt' null as '';
127-
128-DROP TABLE IF EXISTS admin1CodesAscii;
129-CREATE TABLE admin1CodesAscii (
130+DROP TABLE IF EXISTS admin1codes;
131+CREATE TABLE admin1codes (
132 code varchar(10),
133 name TEXT,
134 nameAscii TEXT,
135 geonameid int
136 );
137-copy admin1CodesAscii (code,name,nameAscii,geonameid) from '$WORKPATH/admin1CodesASCII.txt' null as '';
138+copy admin1codes (code,name,nameAscii,geonameid) from '$WORKPATH/admin1CodesASCII.txt' null as '';
139
140 DROP TABLE IF EXISTS timeZones;
141 CREATE TABLE timeZones (
142+ code CHAR(2),
143 timeZoneId VARCHAR(200),
144 GMT_offset numeric(3,1),
145- DST_offset numeric(3,1)
146+ DST_offset numeric(3,1),
147+ RAW_offset numeric(3,1)
148 );
149-copy timeZones (timeZoneId,GMT_offset,DST_offset) from '$WORKPATH/timeZones.txt.tmp' null as '';
150+copy timeZones (code,timeZoneId,GMT_offset,DST_offset,RAW_offset) from '$WORKPATH/timeZones.txt.tmp' null as '';
151
152 DROP TABLE IF EXISTS continentCodes;
153 CREATE TABLE continentCodes (
154
155=== modified file 'sphinx.conf'
156--- sphinx.conf 2011-04-07 13:26:08 +0000
157+++ sphinx.conf 2012-08-23 04:40:25 +0000
158@@ -41,7 +41,7 @@
159 query_log = /var/log/sphinxsearch/query.log
160 read_timeout = 5
161 max_children = 30
162-pid_file = /var/run/searchd.pid
163+pid_file = /var/run/sphinxsearch/searchd.pid
164 max_matches = 1000
165 seamless_rotate = 1
166 preopen_indexes = 0

Subscribers

People subscribed via source and target branches

to all changes: