introduce a tool that extracts the performance data out of Skybrary
This commit is contained in:
		
							
								
								
									
										151
									
								
								aman/tools/SkybraryAircraftCrawler.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										151
									
								
								aman/tools/SkybraryAircraftCrawler.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,151 @@ | ||||
| #!/usr/bin/env python | ||||
|  | ||||
| import argparse | ||||
| import configparser | ||||
| import os | ||||
| import urllib.request | ||||
| from bs4 import BeautifulSoup | ||||
| from aman.types.PerformanceData import PerformanceData | ||||
|  | ||||
def findAircraftPages(rooturl : str, suburl : str, visited = None):
    """Collect the URLs of the aircraft pages linked from a category listing.

    The page at ``rooturl + suburl`` is downloaded, decoded as UTF-8 and
    parsed. Every anchor with a ``title`` whose ``href`` consists of exactly
    three '/'-separated parts, whose third part equals the title and which
    mentions neither 'Category' (in the title) nor 'Special' (in the href) is
    taken as an aircraft page. Anchors titled 'Category:Aircraft' whose text
    does not contain 'previous' are followed recursively.

    Args:
        rooturl: Scheme and host that is prepended to every relative link.
        suburl: Path (relative to ``rooturl``) of the listing page to crawl.
        visited: Optional set of ``suburl`` values that were crawled already.
            It is filled during the recursion; callers normally omit it.

    Returns:
        A list with the absolute URLs (``rooturl + href``) of the found pages.

    Raises:
        urllib.error.URLError: If a listing page cannot be downloaded.
    """
    # a listing may link the same follow-up page more than once, so every
    # listing page is crawled only a single time
    if visited is None:
        visited = set()
    if suburl in visited:
        return []
    visited.add(suburl)

    # the context manager closes the connection, no explicit close() required
    with urllib.request.urlopen(rooturl + suburl) as site:
        data = site.read().decode('utf-8')

    parsed = BeautifulSoup(data, features='lxml')
    if parsed.body is None:
        return []

    aircrafts = []

    # collect the aircraft pages of this listing
    for link in parsed.body.find_all('a', title=True):
        href = link.get('href')
        if href is None:
            # anchors without a target cannot point to an aircraft page
            continue

        split = href.split('/')
        if 3 == len(split) and split[2] == link['title'] and 'Category' not in link['title'] and 'Special' not in href:
            aircrafts.append(rooturl + href)

    # follow the links to the other listing pages, but never walk backwards
    for link in parsed.body.find_all('a', attrs={ 'title': 'Category:Aircraft' }):
        href = link.get('href')
        if href is not None and 'previous' not in link.text:
            aircrafts.extend(findAircraftPages(rooturl, href, visited))

    return aircrafts
|  | ||||
def findAndParseEntry(tableRow, startIdx, substring, default):
    """Search a table row backwards for a cell containing ``substring``.

    The search starts at ``startIdx`` and moves towards the front of the row.
    The cell at index 0 is never inspected.

    Args:
        tableRow: Indexable sequence of cells that provide a ``text`` attribute.
        startIdx: Index of the first cell that is inspected.
        substring: Text that marks a matching cell (e.g. the unit).
        default: Value that is used if the matching cell does not start with
            an integer.

    Returns:
        A tuple ``(value, nextIdx)``. ``value`` is the leading integer of the
        matching cell (or ``default``) and ``nextIdx`` is the index of the
        matching cell minus two. ``(0, -1)`` is returned if no cell matches.
    """
    while 0 < startIdx:
        text = tableRow[startIdx].text
        if substring not in text:
            startIdx -= 1
            continue

        # split on any whitespace to be robust against leading blanks,
        # line breaks or non-breaking spaces in the cell
        tokens = text.split()
        if 1 >= len(tokens):
            return default, startIdx - 2

        try:
            return int(tokens[0]), startIdx - 2
        except ValueError:
            # the cell carries the unit but no usable number
            return default, startIdx - 2

    return 0, -1
|  | ||||
def findAndParseSpeedEntry(tableRow, startIdx, default):
    """Find and parse the next cell of the row that contains 'kts'.

    Delegates to findAndParseEntry() and returns its result unchanged.
    """
    unit = 'kts'
    entry = findAndParseEntry(tableRow, startIdx, unit, default)
    return entry
|  | ||||
def findAndParseRodEntry(tableRow, startIdx, default):
    """Find and parse the next cell of the row that contains 'ft/min'.

    Delegates to findAndParseEntry() and returns its result unchanged.
    """
    unit = 'ft/min'
    entry = findAndParseEntry(tableRow, startIdx, unit, default)
    return entry
|  | ||||
def _collectEntries(tableRow, parseEntry, defaultFor):
    """Collect all values of a table row, walking from the last cell to the front.

    Args:
        tableRow: Sequence of table cells that is handed to ``parseEntry``.
        parseEntry: Callable ``(tableRow, startIdx, default)`` that returns a
            tuple ``(value, nextIdx)``.
        defaultFor: Callable that maps the number of already collected values
            to the default value of the next entry.

    Returns:
        The list of parsed values, the value of the last matching cell first.
    """
    values = []
    idx = len(tableRow) - 1
    while 0 < idx:
        value, idx = parseEntry(tableRow, idx, defaultFor(len(values)))
        # findAndParseEntry() reports "nothing found" as (0, -1); such a
        # result must not be stored as if it was a parsed value
        # NOTE(review): a genuine value of 0 in the cell at index 1 yields
        # the same tuple and is dropped as well
        if 0 == value and -1 == idx:
            break
        values.append(value)

    return values

def parsePerformanceEntries(tableRowSpeeds, tableRowRODs):
    """Parse the speed and the ROD values out of the two table rows.

    Both rows are searched from the last cell to the front, so the first
    element of every returned list belongs to the last matching cell.

    Args:
        tableRowSpeeds: Cells of the row that contains the speeds ('kts').
        tableRowRODs: Cells of the row that contains the RODs ('ft/min').

    Returns:
        A tuple ``(speeds, rods)`` with the two lists of parsed integers.
    """
    # parse the speed data: cells without a number fall back to 140 for the
    # first collected speed and to 250 for all following ones
    speeds = _collectEntries(tableRowSpeeds, findAndParseSpeedEntry,
                             lambda count: 140 if 0 == count else 250)

    # parse the ROD data: cells without a number fall back to 2000
    rods = _collectEntries(tableRowRODs, findAndParseRodEntry,
                           lambda count: 2000)

    return speeds, rods
|  | ||||
def parsePerformanceData(url : str):
    """Download an aircraft page and extract its performance data.

    The ICAO code is read from the ``h5`` element with the id 'siteSub' and
    the class 'subtitle'. The speeds and RODs are read from the second and
    third row of the first 'wikitable' with the style 'font-size: 90%;'.

    Args:
        url: Absolute URL of the aircraft page.

    Returns:
        A tuple ``(valid, aircraft)``. ``aircraft`` is None if the page has no
        ICAO code or no usable performance table. ``valid`` is True only if
        at least four speeds and three RODs were found; otherwise the
        corresponding attributes of ``aircraft`` are not assigned here.

    Raises:
        urllib.error.URLError: If the page cannot be downloaded.
    """
    # the context manager closes the connection, no explicit close() required
    with urllib.request.urlopen(url) as site:
        data = site.read().decode('utf-8')

    parsed = BeautifulSoup(data, features='lxml')
    if parsed.body is None:
        return False, None

    # check if we find the ICAO code
    icao = parsed.body.find('h5', attrs={ 'id' : 'siteSub', 'class' : 'subtitle'})
    if icao is None or '' == icao.text:
        return False, None

    aircraft = PerformanceData(icao.text)
    performanceTable = parsed.body.find('table', attrs={ 'class' : 'wikitable', 'style' : 'font-size: 90%;' })
    if performanceTable is None:
        return False, None

    # the rows with index 1 and 2 are required, shorter tables are unusable
    rows = performanceTable.find_all('tr')
    if 3 > len(rows):
        return False, None

    speeds, rods = parsePerformanceEntries(rows[1].find_all('td'), rows[2].find_all('td'))

    # rows with less than ten speeds get the second entry duplicated
    # NOTE(review): presumably compensates a missing column - TODO confirm
    if 2 <= len(speeds) < 10:
        speeds.insert(1, speeds[1])

    hasSpeeds = len(speeds) >= 4
    hasRods = len(rods) >= 3

    # create the speed data
    if hasSpeeds:
        aircraft.speedApproach = speeds[0]
        aircraft.speedBelowFL100 = speeds[1]
        aircraft.speedAboveFL100 = speeds[2]
        aircraft.speedAboveFL240 = speeds[3]
    # create the ROD data
    if hasRods:
        aircraft.rodBelowFL100 = rods[0]
        aircraft.rodAboveFL100 = rods[1]
        aircraft.rodAboveFL240 = rods[2]

    return hasSpeeds and hasRods, aircraft
|  | ||||
if __name__ == '__main__':
    # Script entry: crawl the aircraft pages, parse the performance data of
    # every page and write all valid entries into <directory>/PerformanceData.ini

    # create the commandline parser
    parser = argparse.ArgumentParser(description='Extract the aircraft performace data')
    parser.add_argument('directory', help='Directory where to store the performance data configuration')
    args = parser.parse_args()

    # create the directory if it does not exist
    # (exist_ok avoids the race between the existence check and the creation)
    os.makedirs(args.directory, exist_ok=True)

    # parse the aircrafts
    links = findAircraftPages('https://www.skybrary.aero', '/index.php?title=Category:Aircraft')
    print('Found ' + str(len(links)) + ' aircrafts')

    aircrafts = []
    for parsed, link in enumerate(links, start=1):
        valid, aircraft = parsePerformanceData(link)

        # the carriage return keeps the progress in a single console line
        print('Parsed ' + str(parsed) + ' of ' + str(len(links)), end='\r')

        if not valid:
            print('Unable to find performance data for ' + link)
            continue

        aircrafts.append(aircraft)

    print('Successfully parsed ' + str(len(aircrafts)) + ' of ' + str(len(links)) + ' aircrafts')

    # create the configuration file with one section per ICAO code
    # (an aircraft with an already used ICAO code replaces the earlier section)
    config = configparser.ConfigParser()
    for aircraft in aircrafts:
        config[aircraft.icao] = {
            'speedAboveFL240' : aircraft.speedAboveFL240,
            'rodAboveFL240' : aircraft.rodAboveFL240,
            'speedAboveFL100' : aircraft.speedAboveFL100,
            'rodAboveFL100' : aircraft.rodAboveFL100,
            'speedBelowFL100' : aircraft.speedBelowFL100,
            'rodBelowFL100' : aircraft.rodBelowFL100,
            'speedApproach' : aircraft.speedApproach
        }

    # write the configuration data
    with open(os.path.join(args.directory, 'PerformanceData.ini'), 'w') as file:
        config.write(file)
		Reference in New Issue
	
	Block a user