SkybraryAircraftCrawler.py 5.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151
  1. #!/usr/bin/env python
  2. import argparse
  3. import configparser
  4. import os
  5. import urllib.request
  6. from bs4 import BeautifulSoup
  7. from aman.types.PerformanceData import PerformanceData
  8. def findAircraftPages(rooturl : str, suburl : str):
  9. aircrafts = []
  10. with urllib.request.urlopen(rooturl + suburl) as site:
  11. data = site.read().decode('utf-8')
  12. site.close()
  13. parsed = BeautifulSoup(data, features='lxml')
  14. for link in parsed.body.find_all('a', title=True):
  15. split = link['href'].split('/')
  16. if 3 == len(split) and split[2] == link['title'] and 'Category' not in link['title'] and 'Special' not in link['href']:
  17. aircrafts.append(rooturl + link['href'])
  18. for link in parsed.body.find_all('a', attrs={ 'title': 'Category:Aircraft' }):
  19. if 'previous' not in link.text:
  20. aircrafts.extend(findAircraftPages(rooturl, link['href']))
  21. return aircrafts
  22. def findAndParseEntry(tableRow, startIdx, substring, default):
  23. while 0 < startIdx:
  24. if substring in tableRow[startIdx].text:
  25. split = tableRow[startIdx].text.split(' ')
  26. if 1 >= len(split):
  27. return default, startIdx - 2
  28. else:
  29. return int(split[0]), startIdx - 2
  30. else:
  31. startIdx -= 1
  32. return 0, -1
  33. def findAndParseSpeedEntry(tableRow, startIdx, default):
  34. return findAndParseEntry(tableRow, startIdx, 'kts', default)
  35. def findAndParseRodEntry(tableRow, startIdx, default):
  36. return findAndParseEntry(tableRow, startIdx, 'ft/min', default)
  37. def parsePerformanceEntries(tableRowSpeeds, tableRowRODs):
  38. speeds = []
  39. rods = []
  40. # parse the speed data
  41. idx = len(tableRowSpeeds) - 1
  42. while 0 < idx:
  43. parsed = findAndParseSpeedEntry(tableRowSpeeds, idx, 140 if 0 == len(speeds) else 250)
  44. if 0 < idx:
  45. speeds.append(parsed[0])
  46. idx = parsed[1]
  47. # parse the ROD data
  48. idx = len(tableRowRODs) - 1
  49. while 0 < idx:
  50. parsed = findAndParseRodEntry(tableRowRODs, idx, 2000)
  51. if 0 < idx:
  52. rods.append(parsed[0])
  53. idx = parsed[1]
  54. return speeds, rods
  55. def parsePerformanceData(url : str):
  56. with urllib.request.urlopen(url) as site:
  57. data = site.read().decode('utf-8')
  58. site.close()
  59. # check if we find the ICAO code
  60. parsed = BeautifulSoup(data, features='lxml')
  61. icao = parsed.body.find('h5', attrs={ 'id' : 'siteSub', 'class' : 'subtitle'})
  62. if None == icao or '' == icao.text:
  63. return False, None
  64. aircraft = PerformanceData(icao.text)
  65. performanceTable = parsed.body.find('table', attrs={ 'class' : 'wikitable', 'style' : 'font-size: 90%;' })
  66. if None == performanceTable or None == performanceTable.find_all('tr')[1] or None == performanceTable.find_all('tr')[2]:
  67. return False, None
  68. speeds, rods = parsePerformanceEntries(performanceTable.find_all('tr')[1].find_all('td'),
  69. performanceTable.find_all('tr')[2].find_all('td'))
  70. if 10 > len(speeds):
  71. speeds.insert(1, speeds[1])
  72. # create the speed data
  73. if len(speeds) >= 4:
  74. aircraft.speedApproach = speeds[0]
  75. aircraft.speedBelowFL100 = speeds[1]
  76. aircraft.speedAboveFL100 = speeds[2]
  77. aircraft.speedAboveFL240 = speeds[3]
  78. # create the ROD data
  79. if len(rods) >= 3:
  80. aircraft.rodBelowFL100 = rods[0]
  81. aircraft.rodAboveFL100 = rods[1]
  82. aircraft.rodAboveFL240 = rods[2]
  83. return len(speeds) >= 4 and len(rods) >= 3, aircraft
  84. if __name__ == '__main__':
  85. # create the commandline parser
  86. parser = argparse.ArgumentParser(description='Extract the aircraft performace data')
  87. parser.add_argument('directory', help='Directory where to store the performance data configuration')
  88. args = parser.parse_args()
  89. # create the directory if it does not exist
  90. if not os.path.exists(args.directory):
  91. os.makedirs(args.directory)
  92. # parse the aircrafts
  93. links = findAircraftPages('https://www.skybrary.aero', '/index.php?title=Category:Aircraft')
  94. print('Found ' + str(len(links)) + ' aircrafts')
  95. aircrafts = []
  96. parsed = 0
  97. for link in links:
  98. valid, aircraft = parsePerformanceData(link)
  99. parsed += 1
  100. print('Parsed ' + str(parsed) + ' of ' + str(len(links)), end='\r')
  101. if False == valid:
  102. print('Unable to find performance data for ' + link)
  103. continue
  104. aircrafts.append(aircraft)
  105. print('Successfully parsed ' + str(len(aircrafts)) + ' of ' + str(len(links)) + ' aircrafts')
  106. # create the configuration file
  107. config = configparser.ConfigParser()
  108. for aircraft in aircrafts:
  109. config[aircraft.icao] = {
  110. 'speedAboveFL240' : aircraft.speedAboveFL240,
  111. 'rodAboveFL240' : aircraft.rodAboveFL240,
  112. 'speedAboveFL100' : aircraft.speedAboveFL100,
  113. 'rodAboveFL100' : aircraft.rodAboveFL100,
  114. 'speedBelowFL100' : aircraft.speedBelowFL100,
  115. 'rodBelowFL100' : aircraft.rodBelowFL100,
  116. 'speedApproach' : aircraft.speedApproach
  117. }
  118. # write the configuration data
  119. with open(args.directory + '/PerformanceData.ini', 'w') as file:
  120. config.write(file)