diff --git a/aman/com/DwdCrawler.py b/aman/com/DwdCrawler.py
new file mode 100644
index 0000000..06b9818
--- /dev/null
+++ b/aman/com/DwdCrawler.py
@@ -0,0 +1,157 @@
+#!/usr/bin/env python
+
+import datetime
+import time
+import urllib.request
+
+from bs4 import BeautifulSoup
+from datetime import datetime as dt
+from threading import Thread
+
+# @brief Checks the DWD pages for wind information
+# Format:
+#  Provides the next update time (updateTime) of the DWD page in UTC
+#  Provides a list of wind information (windData)
+#  - organized as a list of tuples
+#  - first element of tuple: GAFOR-IDs for the following wind information
+#  - second element of tuple: list of tuples of wind data
+#    - first element of wind data tuple: minimum altitude AMSL for this wind information
+#    - second element of wind data tuple: wind direction
+#    - third element of wind data tuple: wind speed (KT)
+class DwdCrawler(Thread):
+    def __init__(self):
+        Thread.__init__(self)
+        self.dataAvailable = False
+        self.executing = True
+        self.start()
+
+    def parseGaforAreas(areas : str):
+        areas = areas.replace(':', '')
+        areas = areas.split(' ')[1]
+        areaIds = []
+
+        # some IDs are lists
+        for segment in areas.split(','):
+            # check if we have range definitions or single IDs
+            borders = segment.split('-')
+            if 2 == len(borders):
+                areaIds.extend(range(int(borders[0]), int(borders[1]) + 1))
+            else:
+                areaIds.append(int(borders[0]))
+
+        return areaIds
+
+    def parseWindTableRow(row : str, table):
+        # get the columns
+        entries = row.split('|')
+
+        # check if the line is invalid or we have the header
+        if 2 > len(entries) or 'AMSL' in entries[0]:
+            return table
+
+        # parse the wind data
+        windData = entries[1].strip().split(' ')[0].split('/')
+        if 2 != len(windData):
+            return table
+
+        # extend the table
+        altitude = entries[0].strip()
+        if 'FL' in altitude:
+            altitude = int(altitude.replace('FL', '')) * 100
+        else:
+            altitude = int(altitude.replace('FT', ''))
+        row = ( altitude, int(windData[0]), int(windData[1].replace('KT', '')) )
+        table.append(row)
+
+        return table
+
+    def parseNextUpdateTime(line : str):
+        entries = line.split(' ')
+        if 4 <= len(entries):
+            utcIndex = 2
+            if 'UTC' in entries[len(entries) - 2]:
+                utcIndex = len(entries) - 3
+            elif 'UTC' in entries[len(entries) - 1]:
+                utcIndex = len(entries) - 2
+
+            currentUtc = dt.utcfromtimestamp(int(time.time()))
+            currentHour = int(currentUtc.strftime('%H'))
+
+            # check if we have a day overlap
+            if currentHour > int(entries[utcIndex].split('.')[0]):
+                nextDay = currentUtc + datetime.timedelta(days=1)
+                date = nextDay.strftime('%Y-%m-%d')
+            else:
+                date = currentUtc.strftime('%Y-%m-%d')
+
+            # create the new UTC update time
+            return dt.strptime(date + ' ' + entries[utcIndex] + '+0000', '%Y-%m-%d %H.%M%z')
+
+    def parseGaforPage(self, url : str):
+        with urllib.request.urlopen(url) as site:
+            data = site.read().decode('utf-8')
+            site.close()
+
+        parsed = BeautifulSoup(data, features='lxml')
+
+        # search the info about the GAFOR areas
+        content = None
+        for element in parsed.body.find_all('pre'):
+            content = element.text
+
+        # analyze the received data
+        if None != content:
+            windInformation = []
+            updateTime = None
+            windTable = []
+            areaIds = None
+
+            # find all relevant information
+            for line in content.splitlines():
+                if '' == line:
+                    if 0 != len(windTable):
+                        windInformation.append(( areaIds, windTable ))
+                        areaIds = None
+                        windTable = []
+                elif line.startswith('GAFOR-Gebiete'):
+                    areaIds = DwdCrawler.parseGaforAreas(line)
+                    windTable = []
+                elif None != areaIds:
+                    windTable = DwdCrawler.parseWindTableRow(line, windTable)
+                elif 'Aktualisierung erfolgt um ' in line:
+                    updateTime = DwdCrawler.parseNextUpdateTime(line)
+
+            # return the collected information
+            if 0 != len(windInformation) and None != updateTime:
+                return updateTime, windInformation
+
+        return None, None
+
+    def run(self):
+        with urllib.request.urlopen('https://www.dwd.de/DE/fachnutzer/luftfahrt/teaser/luftsportberichte/luftsportberichte_node.html') as site:
+            data = site.read().decode('utf-8')
+            site.close()
+
+        # find the pages of the GAFOR reports
+        pages = []
+        parsed = BeautifulSoup(data, features='lxml')
+        for link in parsed.body.find_all('a', title=True):
+            if 'node' in link['href'] and 'Flugwetterprognose' in link['title']:
+                # remove the jsession from the link
+                pages.append('https://www.dwd.de/' + link['href'].split(';')[0])
+
+        # receive the wind data
+        self.updateTime = None
+        self.windData = []
+        for page in pages:
+            nextUpdate, wind = self.parseGaforPage(page)
+            if None != nextUpdate:
+                if None == self.updateTime or self.updateTime > nextUpdate:
+                    self.updateTime = nextUpdate
+                self.windData.extend(wind)
+
+        # indicate that new wind data is available
+        if None != self.updateTime:
+            self.dataAvailable = True
+
+        self.executing = False
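
A minimal usage sketch (not part of the diff) may help reviewers: run() executes on a background thread that __init__ starts immediately, so a caller polls executing and dataAvailable before reading updateTime and windData. The import path and the one-second polling interval are assumptions for illustration:

    # hypothetical consumer; the import path assumes the package layout above
    import time
    from aman.com.DwdCrawler import DwdCrawler

    crawler = DwdCrawler()       # starts the crawler thread immediately
    while crawler.executing:     # wait until the crawl has finished
        time.sleep(1)            # polling interval is an arbitrary choice

    if crawler.dataAvailable:
        print('next DWD update (UTC):', crawler.updateTime)
        for areaIds, table in crawler.windData:
            # table rows: (altitude AMSL in ft, wind direction, wind speed in KT)
            print('GAFOR areas', areaIds, '->', table)

Since DwdCrawler subclasses Thread, crawler.join() would block without polling; the executing/dataAvailable flags are mainly useful when the caller runs its own event loop.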