From 8cd5aa6baf2821eab4f41b1c2dcaede7935544f5 Mon Sep 17 00:00:00 2001
From: Sven Czarnian
Date: Thu, 2 Sep 2021 09:01:16 +0200
Subject: [PATCH] introduce a tool that extracts the performance data out of Skybrary

---
 aman/tools/SkybraryAircraftCrawler.py | 215 ++++++++++++++++++++++++++
 1 file changed, 215 insertions(+)
 create mode 100644 aman/tools/SkybraryAircraftCrawler.py

diff --git a/aman/tools/SkybraryAircraftCrawler.py b/aman/tools/SkybraryAircraftCrawler.py
new file mode 100644
index 0000000..7c188fd
--- /dev/null
+++ b/aman/tools/SkybraryAircraftCrawler.py
@@ -0,0 +1,215 @@
+#!/usr/bin/env python
+
+"""Crawl the Skybrary aircraft category and export the performance data.
+
+The speed and ROD values found on every aircraft page are written
+to PerformanceData.ini inside the directory given on the command line.
+"""
+
+import argparse
+import configparser
+import os
+import urllib.error
+import urllib.request
+from bs4 import BeautifulSoup
+from aman.types.PerformanceData import PerformanceData
+
+def _fetchPage(url : str):
+    """Download url and return the response body decoded as UTF-8."""
+    with urllib.request.urlopen(url) as site:
+        return site.read().decode('utf-8')
+
+def findAircraftPages(rooturl : str, suburl : str):
+    """Collect the URLs of the aircraft pages linked from a category listing.
+
+    The crawl starts at rooturl + suburl and continues with every link titled
+    'Category:Aircraft' whose text does not contain 'previous'.
+
+    Returns the list of rooturl + href of all matching links, in discovery
+    order and without duplicates.
+    """
+    aircrafts = []
+    known = set()
+    visited = set()
+    pending = [suburl]
+
+    while pending:
+        current = pending.pop()
+        # the same listing can be linked more than once, download it only once
+        if current in visited:
+            continue
+        visited.add(current)
+
+        parsed = BeautifulSoup(_fetchPage(rooturl + current), features='lxml')
+
+        # an aircraft link has three '/'-separated href parts, the last one equal to its title
+        for link in parsed.body.find_all('a', title=True, href=True):
+            split = link['href'].split('/')
+            if 3 == len(split) and split[2] == link['title'] and 'Category' not in link['title'] and 'Special' not in link['href']:
+                url = rooturl + link['href']
+                if url not in known:
+                    known.add(url)
+                    aircrafts.append(url)
+
+        # follow the remaining listing pages, pushed reversed to keep the document order
+        following = [link['href'] for link in parsed.body.find_all('a', attrs={ 'title': 'Category:Aircraft' }, href=True)
+                     if 'previous' not in link.text]
+        pending.extend(reversed(following))
+
+    return aircrafts
+
+def findAndParseEntry(tableRow, startIdx, substring, default):
+    """Search tableRow backwards from startIdx for a cell containing substring.
+
+    Only the indices startIdx down to 1 are inspected, index 0 never is.
+
+    Returns a tuple (value, nextIdx). For a match, value is the integer in
+    front of the first blank of the cell text, or default if the text has no
+    blank or no parsable integer, and nextIdx is the matching index minus
+    two. Without a match (0, -1) is returned.
+    """
+    while 0 < startIdx:
+        text = tableRow[startIdx].text
+        if substring in text:
+            split = text.split(' ')
+            if 1 >= len(split):
+                return default, startIdx - 2
+            try:
+                return int(split[0]), startIdx - 2
+            except ValueError:
+                # a non-numeric first token must not abort the whole crawl
+                return default, startIdx - 2
+        startIdx -= 1
+
+    return 0, -1
+
+def findAndParseSpeedEntry(tableRow, startIdx, default):
+    """Find the next speed entry, i.e. a cell containing 'kts'."""
+    return findAndParseEntry(tableRow, startIdx, 'kts', default)
+
+def findAndParseRodEntry(tableRow, startIdx, default):
+    """Find the next rate entry, i.e. a cell containing 'ft/min'."""
+    return findAndParseEntry(tableRow, startIdx, 'ft/min', default)
+
+def parsePerformanceEntries(tableRowSpeeds, tableRowRODs):
+    """Extract all speed and ROD values of two table rows.
+
+    Both rows are scanned from the last cell towards the first one, so the
+    returned lists are ordered from the end of the row to its beginning.
+    A missing number is replaced by 140 for the first speed found, by 250
+    for every other speed and by 2000 for a ROD.
+
+    Returns a tuple (speeds, rods).
+    """
+    speeds = []
+    rods = []
+
+    # NOTE(review): if the cells left of the last match contain no further entry,
+    # the finder returns (0, -1) and a trailing 0 is appended; the length checks
+    # of the caller may rely on this - confirm before changing it
+
+    # parse the speed data
+    idx = len(tableRowSpeeds) - 1
+    while 0 < idx:
+        value, idx = findAndParseSpeedEntry(tableRowSpeeds, idx, 140 if 0 == len(speeds) else 250)
+        speeds.append(value)
+
+    # parse the ROD data
+    idx = len(tableRowRODs) - 1
+    while 0 < idx:
+        value, idx = findAndParseRodEntry(tableRowRODs, idx, 2000)
+        rods.append(value)
+
+    return speeds, rods
+
+def parsePerformanceData(url : str):
+    """Download one aircraft page and extract its performance data.
+
+    Returns a tuple (valid, aircraft). aircraft is None if the ICAO code or
+    the performance table is missing. valid is True only if at least four
+    speeds and three RODs were found.
+    """
+    parsed = BeautifulSoup(_fetchPage(url), features='lxml')
+
+    # check if we find the ICAO code
+    icao = parsed.body.find('h5', attrs={ 'id' : 'siteSub', 'class' : 'subtitle'})
+    if icao is None or '' == icao.text.strip():
+        return False, None
+
+    aircraft = PerformanceData(icao.text.strip())
+    performanceTable = parsed.body.find('table', attrs={ 'class' : 'wikitable', 'style' : 'font-size: 90%;' })
+    if performanceTable is None:
+        return False, None
+
+    # the speeds are read from the second and the RODs from the third table row
+    rows = performanceTable.find_all('tr')
+    if 3 > len(rows):
+        return False, None
+
+    speeds, rods = parsePerformanceEntries(rows[1].find_all('td'), rows[2].find_all('td'))
+    # duplicating an entry needs at least two of them, shorter lists are rejected below
+    if 2 <= len(speeds) and 10 > len(speeds):
+        speeds.insert(1, speeds[1])
+
+    # create the speed data
+    if len(speeds) >= 4:
+        aircraft.speedApproach = speeds[0]
+        aircraft.speedBelowFL100 = speeds[1]
+        aircraft.speedAboveFL100 = speeds[2]
+        aircraft.speedAboveFL240 = speeds[3]
+    # create the ROD data
+    if len(rods) >= 3:
+        aircraft.rodBelowFL100 = rods[0]
+        aircraft.rodAboveFL100 = rods[1]
+        aircraft.rodAboveFL240 = rods[2]
+
+    return len(speeds) >= 4 and len(rods) >= 3, aircraft
+
+if __name__ == '__main__':
+    # create the commandline parser
+    parser = argparse.ArgumentParser(description='Extract the aircraft performace data')
+    parser.add_argument('directory', help='Directory where to store the performance data configuration')
+    args = parser.parse_args()
+
+    # create the directory if it does not exist
+    os.makedirs(args.directory, exist_ok=True)
+
+    # parse the aircrafts
+    links = findAircraftPages('https://www.skybrary.aero', '/index.php?title=Category:Aircraft')
+    print('Found ' + str(len(links)) + ' aircrafts')
+
+    aircrafts = []
+    for parsed, link in enumerate(links, start=1):
+        try:
+            valid, aircraft = parsePerformanceData(link)
+        except urllib.error.URLError as error:
+            # one unreachable page must not discard the data collected so far
+            print('Unable to download ' + link + ': ' + str(error))
+            continue
+
+        print('Parsed ' + str(parsed) + ' of ' + str(len(links)), end='\r')
+
+        if not valid:
+            print('Unable to find performance data for ' + link)
+            continue
+
+        aircrafts.append(aircraft)
+
+    print('Successfully parsed ' + str(len(aircrafts)) + ' of ' + str(len(links)) + ' aircrafts')
+
+    # create the configuration file
+    config = configparser.ConfigParser()
+    for aircraft in aircrafts:
+        config[aircraft.icao] = {
+            'speedAboveFL240' : aircraft.speedAboveFL240,
+            'rodAboveFL240' : aircraft.rodAboveFL240,
+            'speedAboveFL100' : aircraft.speedAboveFL100,
+            'rodAboveFL100' : aircraft.rodAboveFL100,
+            'speedBelowFL100' : aircraft.speedBelowFL100,
+            'rodBelowFL100' : aircraft.rodBelowFL100,
+            'speedApproach' : aircraft.speedApproach
+        }
+
+    # write the configuration data
+    with open(os.path.join(args.directory, 'PerformanceData.ini'), 'w') as file:
+        config.write(file)