Browse Source

introduce a tool that extracts the performance data out of Skybrary

Sven Czarnian 3 years ago
parent
commit
8cd5aa6baf
1 changed files with 151 additions and 0 deletions
  1. 151 0
      aman/tools/SkybraryAircraftCrawler.py

+ 151 - 0
aman/tools/SkybraryAircraftCrawler.py

@@ -0,0 +1,151 @@
+#!/usr/bin/env python
+
+import argparse
+import configparser
+import os
+import urllib.request
+from bs4 import BeautifulSoup
+from aman.types.PerformanceData import PerformanceData
+
+def findAircraftPages(rooturl : str, suburl : str):
+    """Collect the URLs of all aircraft pages reachable from a category page.
+
+    The page at ``rooturl + suburl`` is downloaded and scanned for aircraft
+    links. Every link titled 'Category:Aircraft' whose text does not contain
+    'previous' is treated as a continuation page and is crawled as well.
+
+    Args:
+        rooturl: Prefix that is prepended to every link found on a page.
+        suburl: Link of the first category page, relative to rooturl.
+
+    Returns:
+        List of aircraft page URLs (rooturl + link) in crawl order.
+    """
+    aircrafts = []
+
+    # explicit work list instead of recursion; the visited set guarantees that
+    # a continuation page which is linked more than once is crawled only once
+    pending = [suburl]
+    visited = set()
+
+    while pending:
+        current = pending.pop()
+        if current in visited:
+            continue
+        visited.add(current)
+
+        # the context manager closes the connection, no explicit close() needed
+        with urllib.request.urlopen(rooturl + current) as site:
+            data = site.read().decode('utf-8')
+
+        parsed = BeautifulSoup(data, features='lxml')
+
+        # an aircraft link has exactly three '/'-separated parts, the last one
+        # equals the link title and it is neither a category nor a special page
+        for link in parsed.body.find_all('a', title=True):
+            href = link.get('href')
+            if href is None:
+                # anchors with a title but without a target cannot be followed
+                continue
+
+            split = href.split('/')
+            if 3 == len(split) and split[2] == link['title'] and 'Category' not in link['title'] and 'Special' not in href:
+                aircrafts.append(rooturl + href)
+
+        # collect the continuation pages, links back to a previous page are ignored
+        followups = []
+        for link in parsed.body.find_all('a', attrs={ 'title': 'Category:Aircraft' }):
+            href = link.get('href')
+            if href is not None and 'previous' not in link.text:
+                followups.append(href)
+
+        # reversed, because pop() takes from the end and the pages need to be
+        # visited in the order in which they are linked
+        pending.extend(reversed(followups))
+
+    return aircrafts
+
+def findAndParseEntry(tableRow, startIdx, substring, default):
+    """Search a table row backwards for a cell and parse its leading number.
+
+    The search starts at startIdx and walks towards the front of the row.
+    The cell at index 0 is never inspected.
+
+    Args:
+        tableRow: Indexable sequence of cells, every cell provides a 'text' attribute.
+        startIdx: Index of the first cell that is inspected.
+        substring: Text that identifies the requested cell (i.e. the unit).
+        default: Value that is used if the cell contains no usable number.
+
+    Returns:
+        Tuple (value, nextIdx). value is the integer in front of the first
+        blank of the matching cell, or default if the text contains no blank
+        or does not start with an integer. nextIdx is the index of the
+        matching cell minus two. (0, -1) is returned if no cell matches.
+    """
+    while 0 < startIdx:
+        text = tableRow[startIdx].text
+        if substring not in text:
+            startIdx -= 1
+            continue
+
+        split = text.split(' ')
+        if 1 >= len(split):
+            return default, startIdx - 2
+
+        try:
+            return int(split[0]), startIdx - 2
+        except ValueError:
+            # the cell names the unit but does not start with a number
+            return default, startIdx - 2
+
+    return 0, -1
+
+def findAndParseSpeedEntry(tableRow, startIdx, default):
+    """Search tableRow backwards from startIdx for a cell containing 'kts'.
+
+    Thin wrapper around findAndParseEntry, the result is returned unchanged.
+    """
+    unit = 'kts'
+    return findAndParseEntry(tableRow, startIdx, unit, default)
+
+def findAndParseRodEntry(tableRow, startIdx, default):
+    """Search tableRow backwards from startIdx for a cell containing 'ft/min'.
+
+    Thin wrapper around findAndParseEntry, the result is returned unchanged.
+    """
+    unit = 'ft/min'
+    return findAndParseEntry(tableRow, startIdx, unit, default)
+
+def parsePerformanceEntries(tableRowSpeeds, tableRowRODs):
+    """Extract all speed and ROD values out of two table rows.
+
+    Both rows are walked from the last cell towards the front. Every search
+    continues at the index that the previous search returned.
+
+    Args:
+        tableRowSpeeds: Cells of the row that is searched for 'kts' entries.
+        tableRowRODs: Cells of the row that is searched for 'ft/min' entries.
+
+    Returns:
+        Tuple (speeds, rods) of two lists in the order in which the values
+        were found, i.e. the last cell of a row comes first.
+    """
+    # NOTE(review): a search that finds no further cell reports (0, -1), the 0
+    # is appended as well and therefore may be the last element of a list
+
+    # parse the speed data
+    speeds = []
+    position = len(tableRowSpeeds) - 1
+    while position > 0:
+        # the very first speed falls back to 140, all following ones to 250
+        fallback = 250 if speeds else 140
+        value, position = findAndParseSpeedEntry(tableRowSpeeds, position, fallback)
+        speeds.append(value)
+
+    # parse the ROD data
+    rods = []
+    position = len(tableRowRODs) - 1
+    while position > 0:
+        value, position = findAndParseRodEntry(tableRowRODs, position, 2000)
+        rods.append(value)
+
+    return speeds, rods
+
+def parsePerformanceData(url : str):
+    """Download one aircraft page and extract its performance data.
+
+    Args:
+        url: Absolute URL of the aircraft page.
+
+    Returns:
+        Tuple (valid, aircraft). (False, None) is returned if the ICAO code
+        or a performance table with at least three rows cannot be found.
+        Otherwise aircraft is the PerformanceData instance of the ICAO code
+        and valid is True only if at least four speeds and three RODs were
+        found, the attributes of aircraft are only set in that case.
+    """
+    # the context manager closes the connection, no explicit close() needed
+    with urllib.request.urlopen(url) as site:
+        data = site.read().decode('utf-8')
+
+    # check if we find the ICAO code
+    parsed = BeautifulSoup(data, features='lxml')
+    icao = parsed.body.find('h5', attrs={ 'id' : 'siteSub', 'class' : 'subtitle'})
+    if icao is None or '' == icao.text:
+        return False, None
+
+    aircraft = PerformanceData(icao.text)
+    performanceTable = parsed.body.find('table', attrs={ 'class' : 'wikitable', 'style' : 'font-size: 90%;' })
+    if performanceTable is None:
+        return False, None
+
+    # the second row is parsed for the speeds and the third row for the RODs,
+    # indexing a shorter table would raise an IndexError
+    rows = performanceTable.find_all('tr')
+    if 3 > len(rows):
+        return False, None
+
+    speeds, rods = parsePerformanceEntries(rows[1].find_all('td'), rows[2].find_all('td'))
+
+    # short speed lists get the second entry duplicated, this requires that
+    # a second entry exists
+    if 2 <= len(speeds) < 10:
+        speeds.insert(1, speeds[1])
+
+    # create the speed data
+    if len(speeds) >= 4:
+        aircraft.speedApproach = speeds[0]
+        aircraft.speedBelowFL100 = speeds[1]
+        aircraft.speedAboveFL100 = speeds[2]
+        aircraft.speedAboveFL240 = speeds[3]
+    # create the ROD data
+    if len(rods) >= 3:
+        aircraft.rodBelowFL100 = rods[0]
+        aircraft.rodAboveFL100 = rods[1]
+        aircraft.rodAboveFL240 = rods[2]
+
+    return len(speeds) >= 4 and len(rods) >= 3, aircraft
+
+if __name__ == '__main__':
+    # create the commandline parser
+    parser = argparse.ArgumentParser(description='Extract the aircraft performace data')
+    parser.add_argument('directory', help='Directory where to store the performance data configuration')
+    args = parser.parse_args()
+
+    # create the directory if it does not exist
+    # (exist_ok avoids the race between a separate check and the creation)
+    os.makedirs(args.directory, exist_ok=True)
+
+    # parse the aircrafts
+    links = findAircraftPages('https://www.skybrary.aero', '/index.php?title=Category:Aircraft')
+    print('Found ' + str(len(links)) + ' aircrafts')
+
+    aircrafts = []
+    for parsed, link in enumerate(links, start=1):
+        failure = 'Unable to find performance data for ' + link
+        try:
+            valid, aircraft = parsePerformanceData(link)
+        except OSError as error:
+            # urllib reports network and HTTP errors as subclasses of OSError,
+            # a single unreachable page must not abort the complete crawl
+            valid, aircraft = False, None
+            failure = 'Unable to download ' + link + ': ' + str(error)
+
+        print('Parsed ' + str(parsed) + ' of ' + str(len(links)), end='\r')
+
+        if not valid:
+            print(failure)
+            continue
+
+        aircrafts.append(aircraft)
+
+    print('Successfully parsed ' + str(len(aircrafts)) + ' of ' + str(len(links)) + ' aircrafts')
+
+    # create the configuration file with one section per ICAO code,
+    # a later aircraft with the same code replaces the earlier section
+    config = configparser.ConfigParser()
+    for aircraft in aircrafts:
+        config[aircraft.icao] = {
+            'speedAboveFL240' : aircraft.speedAboveFL240,
+            'rodAboveFL240' : aircraft.rodAboveFL240,
+            'speedAboveFL100' : aircraft.speedAboveFL100,
+            'rodAboveFL100' : aircraft.rodAboveFL100,
+            'speedBelowFL100' : aircraft.speedBelowFL100,
+            'rodBelowFL100' : aircraft.rodBelowFL100,
+            'speedApproach' : aircraft.speedApproach
+        }
+
+    # write the configuration data
+    with open(os.path.join(args.directory, 'PerformanceData.ini'), 'w') as file:
+        config.write(file)