Files
aman-sys/aman/tools/SkybraryAircraftCrawler.py

152 lines
5.4 KiB
Python

#!/usr/bin/env python
import argparse
import configparser
import os
import urllib.request
from bs4 import BeautifulSoup
from aman.types.PerformanceData import PerformanceData
def findAircraftPages(rooturl : str, suburl : str):
    """Collect the URLs of all aircraft pages reachable from a category page.

    The page ``rooturl + suburl`` is downloaded and every anchor with a title
    is inspected. An anchor is treated as an aircraft page if its href splits
    into exactly three '/'-separated parts, the last part equals the title and
    neither 'Category' (title) nor 'Special' (href) occurs. Anchors titled
    'Category:Aircraft' whose text does not contain 'previous' are followed
    recursively.

    Args:
        rooturl: Scheme and host that is prepended to every relative link.
        suburl: Relative URL of the category page where the search starts.

    Returns:
        List of absolute aircraft page URLs in the order of discovery without
        duplicates.

    Raises:
        urllib.error.URLError: If one of the pages cannot be downloaded.
    """
    visited = set()
    aircrafts = []

    def crawl(pageurl):
        # a page may link to the same follow-up page more than once (or back to
        # an already visited page); every page is downloaded only once
        if pageurl in visited:
            return
        visited.add(pageurl)

        # the context manager closes the connection
        with urllib.request.urlopen(rooturl + pageurl) as site:
            data = site.read().decode('utf-8')

        parsed = BeautifulSoup(data, features='lxml')

        for link in parsed.body.find_all('a', title=True):
            # anchors with a title do not necessarily carry a href
            href = link.get('href')
            if href is None:
                continue

            split = href.split('/')
            if 3 == len(split) and split[2] == link['title'] and 'Category' not in link['title'] and 'Special' not in href:
                aircrafts.append(rooturl + href)

        for link in parsed.body.find_all('a', attrs={ 'title': 'Category:Aircraft' }):
            href = link.get('href')
            if href is not None and 'previous' not in link.text:
                crawl(href)

    crawl(suburl)

    # dict.fromkeys removes duplicates and keeps the order of discovery
    return list(dict.fromkeys(aircrafts))
def findAndParseEntry(tableRow, startIdx, substring, default):
    """Search a table row backwards for the next cell that contains ``substring``.

    The cells are visited from ``startIdx`` down to index 1; index 0 is never
    inspected. The text of the first matching cell is split at single spaces
    and its leading token is interpreted as the integer value.

    Args:
        tableRow: Indexable sequence of cells; every cell provides its content as ``text``.
        startIdx: Index of the first cell that is inspected.
        substring: Marker (e.g. a unit) that identifies a relevant cell.
        default: Value that is reported if the matching cell carries no usable number.

    Returns:
        A tuple ``(value, nextIdx)`` where ``nextIdx`` is the index of the
        matching cell minus two. ``(0, -1)`` is returned if no cell matched.
    """
    for idx in range(startIdx, 0, -1):
        text = tableRow[idx].text
        if substring not in text:
            continue

        tokens = text.split(' ')
        if 1 >= len(tokens):
            # no space-separated value in front of the marker
            return default, idx - 2

        try:
            return int(tokens[0]), idx - 2
        except ValueError:
            # the leading token is no integer (e.g. empty because of a leading
            # blank); treat it like a missing value instead of aborting
            return default, idx - 2

    return 0, -1
def findAndParseSpeedEntry(tableRow, startIdx, default):
    """Search ``tableRow`` backwards from ``startIdx`` for the next cell marked with 'kts'.

    The arguments and the result are the ones of findAndParseEntry.
    """
    speedMarker = 'kts'
    return findAndParseEntry(tableRow, startIdx, speedMarker, default)
def findAndParseRodEntry(tableRow, startIdx, default):
    """Search ``tableRow`` backwards from ``startIdx`` for the next cell marked with 'ft/min'.

    The arguments and the result are the ones of findAndParseEntry.
    """
    rodMarker = 'ft/min'
    return findAndParseEntry(tableRow, startIdx, rodMarker, default)
def parsePerformanceEntries(tableRowSpeeds, tableRowRODs):
    """Extract all speed and rate-of-descent values of two table rows.

    Both rows are walked from the last cell towards the first one. Every
    successful search contributes one value and moves the search position to
    the index reported by the entry parser.

    Args:
        tableRowSpeeds: Cells of the row that is searched for 'kts' entries.
        tableRowRODs: Cells of the row that is searched for 'ft/min' entries.

    Returns:
        A tuple ``(speeds, rods)`` of two integer lists, both ordered from the
        last matching cell of the row to the first one.
    """
    speeds = []
    rods = []

    # parse the speed data
    idx = len(tableRowSpeeds) - 1
    while 0 < idx:
        # the very first value uses a lower fallback (140) than the following ones (250)
        parsed = findAndParseSpeedEntry(tableRowSpeeds, idx, 140 if 0 == len(speeds) else 250)
        # (0, -1) is the result of an unsuccessful search and no measured value
        # NOTE(review): a real entry of 0 at index 1 yields the same tuple and is dropped as well
        if (0, -1) != tuple(parsed):
            speeds.append(parsed[0])
        idx = parsed[1]

    # parse the ROD data
    idx = len(tableRowRODs) - 1
    while 0 < idx:
        parsed = findAndParseRodEntry(tableRowRODs, idx, 2000)
        # skip the result of an unsuccessful search (see above)
        if (0, -1) != tuple(parsed):
            rods.append(parsed[0])
        idx = parsed[1]

    return speeds, rods
def parsePerformanceData(url : str):
    """Download an aircraft page and extract its performance data.

    The ICAO code is read from the 'h5' element with the id 'siteSub' and the
    class 'subtitle'. The speeds and rates of descent are read from the second
    and third row of the 'wikitable' table with the style 'font-size: 90%;'.

    Args:
        url: Absolute URL of the aircraft page.

    Returns:
        A tuple ``(valid, aircraft)``. ``aircraft`` is ``None`` if the ICAO code
        or the performance table (with at least three rows) is missing.
        Otherwise it is the PerformanceData instance and ``valid`` indicates if
        at least four speeds and three rates of descent were found.

    Raises:
        urllib.error.URLError: If the page cannot be downloaded.
    """
    # the context manager closes the connection
    with urllib.request.urlopen(url) as site:
        data = site.read().decode('utf-8')

    parsed = BeautifulSoup(data, features='lxml')

    # check if we find the ICAO code
    icao = parsed.body.find('h5', attrs={ 'id' : 'siteSub', 'class' : 'subtitle'})
    if icao is None or '' == icao.text:
        return False, None

    aircraft = PerformanceData(icao.text)

    performanceTable = parsed.body.find('table', attrs={ 'class' : 'wikitable', 'style' : 'font-size: 90%;' })
    if performanceTable is None:
        return False, None

    # indexing a too short row list raises instead of returning None,
    # therefore the number of rows needs to be validated
    rows = performanceTable.find_all('tr')
    if 3 > len(rows):
        return False, None

    speeds, rods = parsePerformanceEntries(rows[1].find_all('td'), rows[2].find_all('td'))

    # duplicate the second speed if fewer than ten speeds were found
    # NOTE(review): presumably compensates a missing column - confirm
    # (at least two speeds are required to have a second one)
    if 2 <= len(speeds) < 10:
        speeds.insert(1, speeds[1])

    # create the speed data
    validSpeeds = len(speeds) >= 4
    if validSpeeds:
        aircraft.speedApproach = speeds[0]
        aircraft.speedBelowFL100 = speeds[1]
        aircraft.speedAboveFL100 = speeds[2]
        aircraft.speedAboveFL240 = speeds[3]

    # create the ROD data
    validRods = len(rods) >= 3
    if validRods:
        aircraft.rodBelowFL100 = rods[0]
        aircraft.rodAboveFL100 = rods[1]
        aircraft.rodAboveFL240 = rods[2]

    return validSpeeds and validRods, aircraft
if __name__ == '__main__':
    # Crawls all aircraft pages, parses their performance data and stores the
    # valid entries in '<directory>/PerformanceData.ini' (one section per ICAO code).

    # create the commandline parser
    parser = argparse.ArgumentParser(description='Extract the aircraft performance data')
    parser.add_argument('directory', help='Directory where to store the performance data configuration')
    args = parser.parse_args()

    # create the directory if it does not exist
    # (exist_ok avoids the race between checking and creating the directory)
    os.makedirs(args.directory, exist_ok=True)

    # parse the aircrafts
    links = findAircraftPages('https://www.skybrary.aero', '/index.php?title=Category:Aircraft')
    print('Found ' + str(len(links)) + ' aircrafts')

    aircrafts = []
    for parsed, link in enumerate(links, start=1):
        valid, aircraft = parsePerformanceData(link)

        # the carriage return keeps the progress in a single console line
        print('Parsed ' + str(parsed) + ' of ' + str(len(links)), end='\r')

        if not valid:
            print('Unable to find performance data for ' + link)
            continue

        aircrafts.append(aircraft)

    print('Successfully parsed ' + str(len(aircrafts)) + ' of ' + str(len(links)) + ' aircrafts')

    # create the configuration file
    # (a later aircraft replaces the section of an earlier one with the same ICAO code)
    config = configparser.ConfigParser()
    for aircraft in aircrafts:
        config[aircraft.icao] = {
            'speedAboveFL240' : aircraft.speedAboveFL240,
            'rodAboveFL240' : aircraft.rodAboveFL240,
            'speedAboveFL100' : aircraft.speedAboveFL100,
            'rodAboveFL100' : aircraft.rodAboveFL100,
            'speedBelowFL100' : aircraft.speedBelowFL100,
            'rodBelowFL100' : aircraft.rodBelowFL100,
            'speedApproach' : aircraft.speedApproach
        }

    # write the configuration data
    with open(os.path.join(args.directory, 'PerformanceData.ini'), 'w') as file:
        config.write(file)