#!/usr/bin/env python
import datetime
import time
import urllib.request
from bs4 import BeautifulSoup
from datetime import datetime as dt
from threading import Thread

# @brief Checks the DWD pages for wind information
# Format:
# Provides the next update time (updateTime) of the DWD page in UTC
# Provides a list of wind information (windData)
# - organized as a list of tuples
#   - first element of tuple: GAFOR-IDs for the following wind information
#   - second element of tuple: list of tuples of wind data
#     - first element of wind data tuple: minimum altitude AMSL for this wind information
#     - second element of wind data tuple: wind direction
#     - third element of wind data tuple: wind speed (KT)
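#
# Example windData layout (illustrative values, not taken from a real report):
#   [([10, 11, 12], [(2000, 270, 15), (5000, 250, 25)]),
#    ([15], [(2000, 300, 10)])]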
class DwdCrawler(Thread):
    def __init__(self):
        Thread.__init__(self)
        self.dataAvailable = False
        self.executing = True
        self.start()

    @staticmethod
    def parseGaforAreas(areas : str):
        areas = areas.replace(':', '')
        areas = areas.split(' ')[1]
        areaIds = []
        # some IDs are lists
        for segment in areas.split(','):
            # check if we have range definitions or single IDs
            borders = segment.split('-')
            if 2 == len(borders):
                areaIds.extend(range(int(borders[0]), int(borders[1]) + 1))
            else:
                areaIds.append(int(borders[0]))
        return areaIds
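    # Example (hypothetical header line, mirroring the format parsed above):
    #   parseGaforAreas('GAFOR-Gebiete 10-12,15:') -> [10, 11, 12, 15]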

    @staticmethod
    def parseWindTableRow(row : str, table):
        # get the columns
        entries = row.split('|')
        # check if the line is invalid or we have the header
        if 2 > len(entries) or 'AMSL' in entries[0]:
            return table
        # parse the wind data
        windData = entries[1].strip().split(' ')[0].split('/')
        if 2 != len(windData):
            return table
        # convert the altitude (flight level -> feet, otherwise plain feet)
        altitude = entries[0].strip()
        if 'FL' in altitude:
            altitude = int(altitude.replace('FL', '')) * 100
        else:
            altitude = int(altitude.replace('FT', ''))
        # extend the table
        table.append(( altitude, int(windData[0]), int(windData[1].replace('KT', '')) ))
        return table
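    # Example (hypothetical table row in the 'altitude | direction/speed' format):
    #   parseWindTableRow('FL050 | 270/15KT', []) -> [(5000, 270, 15)]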

    @staticmethod
    def parseNextUpdateTime(line : str):
        entries = line.split(' ')
        if 4 <= len(entries):
            # locate the 'HH.MM' token that precedes 'UTC'
            utcIndex = 2
            if 'UTC' in entries[len(entries) - 2]:
                utcIndex = len(entries) - 3
            elif 'UTC' in entries[len(entries) - 1]:
                utcIndex = len(entries) - 2
            currentUtc = dt.utcfromtimestamp(int(time.time()))
            currentHour = int(currentUtc.strftime('%H'))
            # check if we have a day overlap
            if currentHour > int(entries[utcIndex].split('.')[0]):
                nextDay = currentUtc + datetime.timedelta(days=1)
                date = nextDay.strftime('%Y-%m-%d')
            else:
                date = currentUtc.strftime('%Y-%m-%d')
            # create the new UTC update time
            return dt.strptime(date + ' ' + entries[utcIndex] + '+0000', '%Y-%m-%d %H.%M%z')
        return None
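    # Example (hypothetical report line, assuming it is parsed before 18.00 UTC):
    #   parseNextUpdateTime('Aktualisierung erfolgt um 18.00 UTC') -> today at 18:00+00:00
    # If the announced hour has already passed, the date rolls over to the next day.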

    def parseGaforPage(self, url : str):
        with urllib.request.urlopen(url) as site:
            data = site.read().decode('utf-8')
        parsed = BeautifulSoup(data, features='lxml')
        # search the info about the GAFOR areas
        content = None
        for element in parsed.body.find_all('pre'):
            content = element.text
        # analyze the received data
        if None != content:
            windInformation = []
            updateTime = None
            windTable = []
            areaIds = None
            # find all relevant information
            for line in content.splitlines():
                if '' == line:
                    if 0 != len(windTable):
                        windInformation.append(( areaIds, windTable ))
                    areaIds = None
                    windTable = []
                elif line.startswith('GAFOR-Gebiete'):
                    areaIds = DwdCrawler.parseGaforAreas(line)
                    windTable = []
                elif None != areaIds:
                    windTable = DwdCrawler.parseWindTableRow(line, windTable)
                elif 'Aktualisierung erfolgt um ' in line:
                    updateTime = DwdCrawler.parseNextUpdateTime(line)
            # return the collected information
            if 0 != len(windInformation) and None != updateTime:
                return updateTime, windInformation
        # always return a pair so that callers can unpack safely
        return None, None
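    # Sketch of the <pre> content this parser expects (inferred from the parsing
    # logic above, not a verbatim DWD excerpt):
    #   GAFOR-Gebiete 10-12,15:
    #   AMSL   | Wind
    #   FL050  | 250/25KT
    #   2000FT | 270/15KT
    #                          <- blank line terminates the block
    #   Die Aktualisierung erfolgt um 18.00 UTC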

    def run(self):
        with urllib.request.urlopen('https://www.dwd.de/DE/fachnutzer/luftfahrt/teaser/luftsportberichte/luftsportberichte_node.html') as site:
            data = site.read().decode('utf-8')
        # find the pages of the GAFOR reports
        pages = []
        parsed = BeautifulSoup(data, features='lxml')
        for link in parsed.body.find_all('a', title=True):
            if 'node' in link['href'] and 'Flugwetterprognose' in link['title']:
                # remove the jsession from the link
                pages.append('https://www.dwd.de/' + link['href'].split(';')[0])
        # receive the wind data
        self.updateTime = None
        self.windData = []
        for page in pages:
            nextUpdate, wind = self.parseGaforPage(page)
            if None != nextUpdate:
                # keep the earliest announced update time of all pages
                if None == self.updateTime or self.updateTime > nextUpdate:
                    self.updateTime = nextUpdate
                self.windData.extend(wind)
        # indicate that new wind data is available
        if None != self.updateTime:
            self.dataAvailable = True
        self.executing = False
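

# Minimal usage sketch (assumes network access to dwd.de; the polling interval
# is arbitrary and only for illustration):
if __name__ == '__main__':
    crawler = DwdCrawler()
    while crawler.executing:
        time.sleep(1)
    if crawler.dataAvailable:
        print('next update:', crawler.updateTime)
        for areaIds, windTable in crawler.windData:
            print('areas', areaIds, '->', windTable)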