data/ciasource/cia2kml.py

0001 #! /usr/bin/env python
0002 # -*- coding: iso-8859-1 -*-
0003 #
0004 # SPDX-License-Identifier: LGPL-2.1-or-later
0005 #
0006 # This python script parses the CIA Factbook 2007 for country data and
0007 # saves it in KML format
0008 # Place this script in the root directory of the CIA Factbook
0009 # (usually "factbook") and run it.
0010 #
# SPDX-FileCopyrightText: 2004-2007 Torsten Rahn <tackat@kde.org>
0012 #
0013
0014 import urllib, urllib2, re
0015
# Opener subclass whose only change is the `version` class attribute.
# NOTE(review): in Python 2's urllib, `version` is the user-agent string the
# opener sends — confirm against the urllib documentation.
class AppURLopener(urllib.FancyURLopener):
    version = "Mozilla/5.0"
# Install an instance as urllib's module-level opener so that the
# urllib.urlopen() calls further down go through it.
urllib._urlopener = AppURLopener()
0019
0020 # This method converts the representation of the coordinate values
0021 # on the CIA webpages to the representation we need in the KML file.
0022
def convertCoordinates( ciaString ):
    """Convert a Factbook coordinate string to the KML "lon,lat" form.

    The expected input looks like "38 00 N, 97 00 W": two comma-separated
    parts, each made of degrees, minutes and a hemisphere letter. An
    optional label ending in "</i> " in front of the value and an optional
    remark after the first ";" are ignored.

    Degrees and minutes are combined into decimal degrees; the value is
    negated for the "S" and "W" hemispheres. The first part is taken as
    the latitude, any later part as the longitude.

    Returns the string "<longitude>,<latitude>", each value formatted
    with three decimals.

    Raises ValueError if a part does not consist of at least three
    whitespace-separated fields or if degrees/minutes are not numeric.
    """
    coordinateString = ciaString

    # Cut off a trailing remark introduced by the first semicolon.
    commentStart = coordinateString.find(";")
    if commentStart != -1:
        coordinateString = coordinateString[:commentStart]

    # Skip a leading label that ends in "</i> ".
    prefixEnd = coordinateString.find("</i> ")
    if prefixEnd != -1:
        coordinateString = coordinateString[prefixEnd + len("</i> "):]

    latCoordinate = 0.0
    lonCoordinate = 0.0

    # Latitude and longitude are told apart by position, not by comparing
    # the text of the parts, and the fields are split on arbitrary
    # whitespace so that doubled or missing blanks do not break parsing.
    for index, coordinate in enumerate(coordinateString.split(",")):
        components = coordinate.split()
        if len(components) < 3:
            raise ValueError("cannot parse coordinate %r in %r"
                             % (coordinate.strip(), ciaString))
        coordinateValue = float(components[0]) + float(components[1]) / 60.0
        if components[2] in ("W", "S"):
            coordinateValue = -coordinateValue
        if index == 0:
            latCoordinate = coordinateValue
        else:
            lonCoordinate = coordinateValue

    # KML wants longitude first.
    return "%04.3f,%04.3f" % (lonCoordinate, latCoordinate)
0054
0055 # This method converts the representation of the population value
0056 # on the CIA webpages to the representation we need in the KML file.
def convertPopulation( ciaString ):
    """Extract the population figure from a Factbook population line.

    A label ending in "</i> " in front of the value is skipped. The first
    space-separated word of the remaining text is taken as the figure and
    its thousands separators (",") are removed.

    Returns the digits as a string. For the exact text "uninhabited", or
    when the first word is "no", the integer 0 is returned instead.

    Raises ValueError if the extracted word is not an integer.
    """
    if ciaString == "uninhabited":
        return 0

    label, marker, remainder = ciaString.partition("</i> ")
    if marker:
        ciaString = remainder.strip()

    firstWord = ciaString.split(' ')[0]
    if firstWord == "no":
        return 0

    digits = firstWord.replace(",", "")
    # Conversion is done only to reject non-numeric text; the caller
    # receives the digit string itself.
    int(digits)
    return digits
0071
def convertArea( ciaString ):
    """Extract the area figure from a Factbook area line.

    The text is narrowed down in four ordered steps, each applied only
    when its marker is present:
      1. keep what follows the first "</i> ",
      2. keep what precedes the first "(",
      3. keep what follows the first ";",
      4. keep what follows the first "land -".
    The first space-separated word of the result, with "," removed, is
    parsed as a float.

    Returns the value formatted with one decimal, as a string.

    Raises ValueError if the extracted word is not a number.
    """
    text = ciaString

    before, marker, after = text.partition("</i> ")
    if marker:
        text = after.strip()

    before, marker, after = text.partition("(")
    if marker:
        text = before.strip()

    before, marker, after = text.partition(";")
    if marker:
        text = after.strip()

    before, marker, after = text.partition("land -")
    if marker:
        text = after.strip()

    firstWord = text.split(' ')[0]
    return "%02.1f" % float(firstWord.replace(",", ""))
0095
0096
0097 # First build two hashes that map FIPS10 to ISO country codes
0098 # and FIPS10 to country names. As a source for data we use
0099 # the appendix-d.html of the CIA Factbook.
0100
# Read appendix D in one go and release the handle right away.
countryCodeHandle = urllib.urlopen("./appendix/appendix-d.html")
try:
    countryCodeSource = countryCodeHandle.readlines()
finally:
    countryCodeHandle.close()

startTag="<td width=\"15%\" align=\"left\" valign=top class=\"Normal\"><b>"
stopTag="</b></td>"
finishTag="</tr>"
subStartTag="<td width=\"10%\" valign=top class=\"Normal\">"
subStopTag="</td>"
fipsString=""
country=""

# Scan state: `inside` is 1 while we are within a table row that started
# with a country-name cell; `subcount` counts the code cells seen in it.
inside=0
subcount=0

countryDict={}
countryCodeDict={}

# The patterns do not depend on the line, so build them once.
countryCellPattern = re.compile( '(.*)'+ startTag +'(.*)' + stopTag )
codeCellPattern = re.compile( '(.*)'+ subStartTag +'(.*)' + subStopTag )
rowEndPattern = re.compile( '(.*)'+ finishTag +'(.*)' )

for line in countryCodeSource:
    # A country-name cell opens a new row; keep the part of the name
    # in front of the first comma.
    if countryCellPattern.match( line ) is not None :
        country = line[ line.find(startTag) + len(startTag) : line.find(stopTag) ].strip()
        country = country.split(',')[0]
        inside=1
        continue

    if inside!=1 :
        continue

    if codeCellPattern.match( line ) is not None :
        sub = line[ line.find(subStartTag) + len(subStartTag) : line.find(subStopTag) ].strip()
        subcount+=1
        if subcount == 1 :
            # First code cell of the row: used as the dictionary key.
            fipsString=sub
        elif subcount == 2 :
            # Second code cell: stored as the country code, unless
            # either cell holds the "-" placeholder.
            if fipsString.strip()!="-" and sub.strip()!="-" :
                countryCodeDict[fipsString]=sub
                countryDict[fipsString]=country

    # End of the table row: reset the scan state for the next country.
    if rowEndPattern.match( line ) is not None :
        inside = 0
        subcount = 0
0139
0140 # Now that the hashes are in place we go through each
0141 # website named after the fips10 code to gather the data
0142 # we need for the KML file.
0143
startTopicTag="<div align=\"right\">"
stopTopicTag="</div>"
topicTagList=["Background:", "Geographic coordinates:", "Area:", "Population:"]
topicBackgroundString=""
topicCoordinatesString=""
topicAreaString=""
topicPopulationString=""
placemarkFolder=""
valueTag="<br> "
# Not referenced by the code below; kept so the module-level name
# retains the value it had before.
finishTag="</table>"


def _findTopicValue( lines, topicTag ):
    """Return the stripped value line for topicTag, or None if absent.

    Scanning starts at a line containing the topic header
    (startTopicTag + topicTag + stopTopicTag). After that, a line
    containing valueTag starts (or restarts) a line counter, and the
    first non-blank line at least two lines below that marker line is
    the value.
    """
    headerPattern = re.compile( '(.*)'+ startTopicTag + topicTag + stopTopicTag +'(.*)' )
    valuePattern = re.compile( '(.*)'+ valueTag )
    inside = False
    subcount = -1
    for line in lines:
        if headerPattern.match( line ) is not None :
            inside = True
            subcount = -1
            continue
        if not inside :
            continue
        if valuePattern.match( line ) is not None :
            subcount = 0
        if subcount > 1 and line.strip() != "" :
            return line.strip()
        if subcount > -1 :
            subcount += 1
    return None


print("Parsing ...")

placemarkList = []

for item in countryCodeDict:
    print(item)
    fileUrlString="./geos/%s.html" % (item.lower())
    try:
        handle = urllib.urlopen(fileUrlString)
    except IOError:
        print("Error: No file for " + countryDict.get( item, "-" ))
        continue
    try:
        countrySource = handle.readlines()
    finally:
        handle.close()

    # Every topic starts out empty for each country, so a topic missing
    # from this page can never pick up the previous country's value.
    topicValues = {}
    for topicTag in topicTagList:
        topicValues[topicTag] = _findTopicValue( countrySource, topicTag ) or ""

    topicBackgroundString = topicValues["Background:"]
    try:
        topicCoordinatesString=convertCoordinates( topicValues["Geographic coordinates:"] )
        topicAreaString=convertArea( topicValues["Area:"] )
        topicPopulationString=convertPopulation( topicValues["Population:"] )
    except (ValueError, IndexError):
        # Missing or malformed figures: report and skip this country,
        # in the same way as a missing file, instead of aborting the run.
        print("Error: Incomplete data for " + countryDict.get( item, "-" ))
        continue
    countryNameString=countryDict.get( item, "-" )
    countryCodeString=countryCodeDict.get( item, "-" )

    topicBackgroundString=topicBackgroundString + "  <i>Source: CIA - The World Factbook 2007</i> "

# Now that we have all the data needed we create a KML snippet that
# represents the country data and add it to the previous ones that
# we've generated already.

    placemarkString='''
    <Placemark>
        <name>%s</name>
        <description>%s</description>
        <countrycode>%s</countrycode>
        <role>S</role>
        <area>%s</area>
        <pop>%s</pop>
        <Point>
            <coordinates>%s</coordinates>
        </Point>
    </Placemark>''' %( countryNameString, topicBackgroundString, countryCodeString, topicAreaString, topicPopulationString, topicCoordinatesString )
    placemarkList.append( placemarkString )

# Join the snippets once instead of growing a string in the loop.
placemarkFolder = "".join( placemarkList )
0223
0224 # Now we insert all our KML code for the countries into a KML
0225 # document
0226
# Wrap the collected placemarks in the KML document skeleton, which
# declares the custom fields used by the placemarks.
kmlDocument = '''<?xml version="1.0" encoding="UTF-8"?>
<kml xmlns="http://earth.google.com/kml/2.0">
<Document>
    <SimpleField>
        <name>pop</name>
        <type>int</type>
    </SimpleField>
    <SimpleField>
        <name>area</name>
        <type>int</type>
    </SimpleField>
    <SimpleField>
        <name>state</name>
        <type>string</type>
    </SimpleField>
    <SimpleField>
        <name>countrycode</name>
        <type>string</type>
    </SimpleField>
    <SimpleField>
        <name>role</name>
        <type>string</type>
    </SimpleField>
%s
</Document>
</kml>''' %( placemarkFolder)

# Finally we save the KML document we have created. The `with` block
# closes the file even if the write fails.
# NOTE(review): the text scraped from the HTML pages is written out
# unchanged while the header declares UTF-8 — confirm the source pages
# contain nothing but ASCII/UTF-8.
with open('ciacountries.kml','w') as out_file:
    out_file.write(kmlDocument)
0259