Files
aman-sys/aman/com/DwdCrawler.py
2021-11-13 09:45:17 +01:00

163 lines
6.1 KiB
Python

#!/usr/bin/env python
import datetime
import time
import urllib.request
from bs4 import BeautifulSoup
from datetime import datetime as dt
# @brief Checks the DWD pages for wind information
# Format:
# Provides next update time (updateTime) of the DWD page in UTC
# Provides a list of wind information (windData)
# - organized as a list of tuples
# - first element of tuple: GAFOR-IDs for the following wind information
# - second element of tuple: list of tuples of wind data
# - first element of wind data tuple: minimum altitude AMSL for this wind information
# - second element of wind data tuple: wind direction
# - third element of wind data tuple: wind speed (KT)
class DwdCrawler():
    """Crawls the DWD (Deutscher Wetterdienst) GAFOR pages for wind information.

    After a successful receiveWindData() call:
      - UpdateTime: next update time of the DWD page as a timezone-aware
        datetime in UTC (earliest across all parsed pages)
      - WindData: dict mapping GAFOR area ID -> list of wind tuples, each
        tuple being (minimum altitude AMSL in feet,
                     wind direction in degrees (0 for 'VRB'),
                     wind speed in KT)
    """

    def __init__(self):
        # populated by receiveWindData()
        self.UpdateTime = None
        self.WindData = None

    @staticmethod
    def parseGaforAreas(areas : str):
        """Parses a 'GAFOR-Gebiete ...' line into a list of integer area IDs.

        Supports single IDs and inclusive ranges,
        e.g. 'GAFOR-Gebiete: 10,12-14' -> [10, 12, 13, 14].
        """
        areas = areas.replace(':', '')
        areas = areas.split(' ')[1]
        areaIds = []
        # some IDs are given as ranges ('12-14'), others as single values
        for segment in areas.split(','):
            borders = segment.split('-')
            if 2 == len(borders):
                areaIds.extend(range(int(borders[0]), int(borders[1]) + 1))
            else:
                areaIds.append(int(borders[0]))
        return areaIds

    @staticmethod
    def parseWindTableRow(row : str, table):
        """Parses one '|'-separated wind-table row and appends it to 'table'.

        Returns 'table', extended by one (altitude_ft, direction_deg, speed_kt)
        tuple when the row is valid. Header rows (containing 'AMSL') and
        malformed rows are skipped and leave 'table' unchanged.
        """
        entries = row.split('|')
        # skip lines without columns as well as the table header
        if 2 > len(entries) or 'AMSL' in entries[0]:
            return table
        # wind data is encoded as 'DIR/SPEEDKT' in the second column
        windData = entries[1].strip().split(' ')[0].split('/')
        if 2 != len(windData):
            return table
        # altitude is given either as a flight level (FL) or in feet (FT)
        altitude = entries[0].strip()
        if 'FL' in altitude:
            altitude = int(altitude.replace('FL', '')) * 100
        else:
            altitude = int(altitude.replace('FT', ''))
        # variable wind directions ('VRB') are encoded as direction 0
        if 'VRB' == windData[0]:
            row = ( altitude, 0, int(windData[1].replace('KT', '')) )
        else:
            row = ( altitude, int(windData[0]), int(windData[1].replace('KT', '')) )
        table.append(row)
        return table

    @staticmethod
    def parseNextUpdateTime(line : str):
        """Parses the 'Aktualisierung erfolgt um HH.MM UTC' announcement line.

        Returns a timezone-aware datetime (UTC) of the next page update,
        rolling over to the next day when the announced hour has already
        passed today. Returns None when the line has too few tokens.
        """
        entries = line.split(' ')
        if 4 <= len(entries):
            utcIndex = 2
            if 'UTC' in entries[len(entries) - 2]:
                utcIndex = len(entries) - 3
            elif 'UTC' in entries[len(entries) - 1]:
                # bugfix: original read 'len(entries - 2)' which raised a
                # TypeError (list minus int); the time token precedes 'UTC'
                utcIndex = len(entries) - 2
            currentUtc = dt.utcfromtimestamp(int(time.time()))
            currentHour = int(currentUtc.strftime('%H'))
            # day overlap: the announced hour already passed -> update is tomorrow
            if currentHour > int(entries[utcIndex].split('.')[0]):
                nextDay = currentUtc + datetime.timedelta(days=1)
                date = nextDay.strftime('%Y-%m-%d')
            else:
                date = currentUtc.strftime('%Y-%m-%d')
            # create the new UTC update time (explicit +0000 offset)
            return dt.strptime(date + ' ' + entries[utcIndex] + '+0000', '%Y-%m-%d %H.%M%z')

    def parseGaforPage(self, url : str):
        """Downloads and parses a single GAFOR report page.

        Returns (nextUpdate, windInformation) where windInformation is a list
        of [areaId, windTable] pairs, or (None, None) when the page yields no
        usable data.
        """
        with urllib.request.urlopen(url) as site:
            data = site.read().decode('utf-8')
        parsed = BeautifulSoup(data, features='lxml')
        # the report text is published inside <pre> elements; keep the last one
        content = None
        for element in parsed.body.find_all('pre'):
            content = element.text
        # bugfix: original fell through returning bare None (not a 2-tuple)
        # when no <pre> content was found, crashing the caller's unpacking
        if content is None:
            return None, None
        windInformation = []
        nextUpdate = None
        windTable = []
        areaIds = None
        # scan the report line by line; a blank line terminates a wind table
        for line in content.splitlines():
            if '' == line:
                if 0 != len(windTable):
                    for areaId in areaIds:
                        windInformation.append([ areaId, windTable ])
                areaIds = None
                windTable = []
            elif line.startswith('GAFOR-Gebiete'):
                areaIds = DwdCrawler.parseGaforAreas(line)
                windTable = []
            elif areaIds is not None:
                windTable = DwdCrawler.parseWindTableRow(line, windTable)
            elif 'Aktualisierung erfolgt um ' in line:
                nextUpdate = DwdCrawler.parseNextUpdateTime(line)
        # return the collected information
        if 0 == len(windInformation) or nextUpdate is None:
            return None, None
        else:
            return nextUpdate, windInformation

    def receiveWindData(self):
        """Fetches all GAFOR report pages and updates UpdateTime / WindData.

        Returns True when new wind data is available, False otherwise.
        """
        self.UpdateTime = None
        self.WindData = None
        with urllib.request.urlopen('https://www.dwd.de/DE/fachnutzer/luftfahrt/teaser/luftsportberichte/luftsportberichte_node.html') as site:
            data = site.read().decode('utf-8')
        # find the pages of the GAFOR reports
        pages = []
        parsed = BeautifulSoup(data, features='lxml')
        for link in parsed.body.find_all('a', title=True):
            if 'node' in link['href'] and 'Flugwetterprognose' in link['title']:
                # remove the jsession suffix from the link
                pages.append('https://www.dwd.de/' + link['href'].split(';')[0])
        # receive the wind data of every page
        self.UpdateTime = None
        self.WindData = {}
        for page in pages:
            nextUpdate, wind = self.parseGaforPage(page)
            if nextUpdate is not None:
                # keep the earliest upcoming update time across all pages
                if self.UpdateTime is None or self.UpdateTime > nextUpdate:
                    self.UpdateTime = nextUpdate
                for gafor in wind:
                    self.WindData[gafor[0]] = gafor[1]
        # indicate whether new wind data is available
        return self.UpdateTime is not None