add a crawler to parse DWD data
157
aman/com/DwdCrawler.py
Normal file
@@ -0,0 +1,157 @@
#!/usr/bin/env python

import datetime
import time
import urllib.request

from bs4 import BeautifulSoup
from datetime import datetime as dt
from threading import Thread

# @brief Checks the DWD pages for wind information
# Format:
# Provides next update time (updateTime) of the DWD page in UTC
# Provides a list of wind information (windData)
# - organized as a list of tuples
# - first element of tuple: GAFOR-IDs for the following wind information
# - second element of tuple: list of tuples of wind data
#   - first element of wind data tuple: minimum altitude AMSL for this wind information
#   - second element of wind data tuple: wind direction
#   - third element of wind data tuple: wind speed (KT)
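# Example of the resulting structure (hypothetical values, for illustration only):
#   updateTime: timezone-aware datetime of the next page update (UTC)
#   windData:   [ ([10, 11, 12], [ (2000, 270, 15), (5000, 280, 25) ]) ]
#               i.e. areas 10-12: 270 deg / 15 KT from 2000 ft AMSL, 280 deg / 25 KT from FL050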
class DwdCrawler(Thread):
    def __init__(self):
        Thread.__init__(self)
        self.dataAvailable = False
        self.executing = True
        # the crawler starts working immediately after construction
        self.start()

    @staticmethod
    def parseGaforAreas(areas : str):
        areas = areas.replace(':', '')
        areas = areas.split(' ')[1]
        areaIds = []

        # some IDs are given as ranges (e.g. 20-23), others as single values
        for segment in areas.split(','):
            # check if we have range definitions or single IDs
            borders = segment.split('-')
            if 2 == len(borders):
                areaIds.extend(range(int(borders[0]), int(borders[1]) + 1))
            else:
                areaIds.append(int(borders[0]))

        return areaIds

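    # Example (hypothetical report line, assuming the IDs form the second
    # whitespace-separated token): parseGaforAreas('GAFOR-Gebiete 20-23,31:')
    # returns [20, 21, 22, 23, 31].
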
    @staticmethod
    def parseWindTableRow(row : str, table):
        # get the columns
        entries = row.split('|')

        # check if the line is invalid or we have the header
        if 2 > len(entries) or 'AMSL' in entries[0]:
            return table

        # parse the wind data
        windData = entries[1].strip().split(' ')[0].split('/')
        if 2 != len(windData):
            return table

        # extend the table
        altitude = entries[0].strip()
        if 'FL' in altitude:
            altitude = int(altitude.replace('FL', '')) * 100
        else:
            altitude = int(altitude.replace('FT', ''))
        row = ( altitude, int(windData[0]), int(windData[1].replace('KT', '')) )
        table.append(row)

        return table

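    # Example (hypothetical table row, assuming a '|'-separated layout of
    # altitude and wind): parseWindTableRow('FL050     | 280/25 KT', [])
    # returns [(5000, 280, 25)].
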
    @staticmethod
    def parseNextUpdateTime(line : str):
        entries = line.split(' ')
        if 4 <= len(entries):
            utcIndex = 2
            if 'UTC' in entries[len(entries) - 2]:
                utcIndex = len(entries) - 3
            elif 'UTC' in entries[len(entries) - 1]:
                utcIndex = len(entries) - 2

            currentUtc = dt.utcfromtimestamp(int(time.time()))
            currentHour = int(currentUtc.strftime('%H'))

            # check if we have a day overlap
            if currentHour > int(entries[utcIndex].split('.')[0]):
                nextDay = currentUtc + datetime.timedelta(days=1)
                date = nextDay.strftime('%Y-%m-%d')
            else:
                date = currentUtc.strftime('%Y-%m-%d')

            # create the new UTC update time
            return dt.strptime(date + ' ' + entries[utcIndex] + '+0000', '%Y-%m-%d %H.%M%z')

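    # Example (hypothetical page text): for the line
    # 'Aktualisierung erfolgt um 12.30 UTC' the method returns a timezone-aware
    # datetime at 12:30 UTC, dated today or the next day if the current UTC hour
    # is already past 12.
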
    def parseGaforPage(self, url : str):
        with urllib.request.urlopen(url) as site:
            data = site.read().decode('utf-8')
            site.close()

        parsed = BeautifulSoup(data, features='lxml')

        # search the info about the GAFOR areas
        content = None
        for element in parsed.body.find_all('pre'):
            content = element.text

        # analyze the received data
        if None != content:
            windInformation = []
            updateTime = None
            windTable = []
            areaIds = None

            # find all relevant information
            for line in content.splitlines():
                if '' == line:
                    if 0 != len(windTable):
                        windInformation.append(( areaIds, windTable ))
                        areaIds = None
                        windTable = []
                elif line.startswith('GAFOR-Gebiete'):
                    areaIds = DwdCrawler.parseGaforAreas(line)
                    windTable = []
                elif None != areaIds:
                    windTable = DwdCrawler.parseWindTableRow(line, windTable)
                elif 'Aktualisierung erfolgt um ' in line:
                    updateTime = DwdCrawler.parseNextUpdateTime(line)

            # return the collected information
            if 0 != len(windInformation) and None != updateTime:
                return updateTime, windInformation
            else:
                return None

    def run(self):
        with urllib.request.urlopen('https://www.dwd.de/DE/fachnutzer/luftfahrt/teaser/luftsportberichte/luftsportberichte_node.html') as site:
            data = site.read().decode('utf-8')
            site.close()

        # find the pages of the GAFOR reports
        pages = []
        parsed = BeautifulSoup(data, features='lxml')
        for link in parsed.body.find_all('a', title=True):
            if 'node' in link['href'] and 'Flugwetterprognose' in link['title']:
                # remove the jsession from the link
                pages.append('https://www.dwd.de/' + link['href'].split(';')[0])

        # receive the wind data
        self.updateTime = None
        self.windData = []
        for page in pages:
            # parseGaforPage returns None if a page could not be parsed
            result = self.parseGaforPage(page)
            if None != result:
                next, wind = result
                if None == self.updateTime or self.updateTime > next:
                    self.updateTime = next
                self.windData.extend(wind)

        # indicate that new wind data is available
        if None != self.updateTime:
            self.dataAvailable = True

        self.executing = False
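
# A minimal usage sketch (illustrative assumption, not dictated by the class itself):
# the crawler starts its worker thread on construction, so a caller only needs to
# wait for the thread to finish before reading updateTime and windData.
if __name__ == '__main__':
    crawler = DwdCrawler()
    crawler.join()
    if crawler.dataAvailable:
        print('next update at', crawler.updateTime)
        for areaIds, winds in crawler.windData:
            print(areaIds, winds)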