#!/usr/bin/env python

import datetime
import time
import urllib.request

from bs4 import BeautifulSoup
from datetime import datetime as dt

# @brief Checks the DWD pages for wind information
# Format:
#   Provides the next update time (updateTime) of the DWD page in UTC
#   Provides a list of wind information (windData)
#       - organized as a list of tuples
#           - first element of tuple: GAFOR-IDs for the following wind information
#           - second element of tuple: list of tuples of wind data
#               - first element of wind data tuple: minimum altitude AMSL for this wind information
#               - second element of wind data tuple: wind direction
#               - third element of wind data tuple: wind speed (KT)
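#
# Example of the resulting structure (illustrative values only):
#   windData = [([10, 11, 12], [(2000, 270, 15), (5000, 250, 20)])]
#   i.e. GAFOR areas 10-12: wind 270 at 15KT from 2000ft AMSL, 250 at 20KT from 5000ft AMSL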
class DwdCrawler():
    def __init__(self):
        self.UpdateTime = None
        self.WindData = None

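    # A hypothetical input line and its expected result, assumed from the
    # parsing logic below (the real DWD wording may differ):
    #   parseGaforAreas('GAFOR-Gebiete: 10-12,15') -> [10, 11, 12, 15]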
    @staticmethod
    def parseGaforAreas(areas : str):
        areas = areas.replace(':', '')
        areas = areas.split(' ')[1]
        areaIds = []

        # some IDs are given as ranges (e.g. 10-14)
        for segment in areas.split(','):
            # check if we have range definitions or single IDs
            borders = segment.split('-')
            if 2 == len(borders):
                areaIds.extend(range(int(borders[0]), int(borders[1]) + 1))
            else:
                areaIds.append(int(borders[0]))

        return areaIds

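    # Hypothetical table rows and their expected results, assumed from the
    # parsing logic below:
    #   parseWindTableRow('FL050 | 270/10 KT', []) -> [(5000, 270, 10)]
    #   parseWindTableRow('2000FT | VRB/05 KT', []) -> [(2000, 0, 5)]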
    @staticmethod
    def parseWindTableRow(row : str, table):
        # get the columns
        entries = row.split('|')

        # check if the line is invalid or we have the header
        if 2 > len(entries) or 'AMSL' in entries[0]:
            return table

        # parse the wind data
        windData = entries[1].strip().split(' ')[0].split('/')
        if 2 != len(windData):
            return table

        # parse the altitude (flight levels are converted to feet)
        altitude = entries[0].strip()
        if 'FL' in altitude:
            altitude = int(altitude.replace('FL', '')) * 100
        else:
            altitude = int(altitude.replace('FT', ''))

        # extend the table (variable winds are stored with direction 0)
        if 'VRB' == windData[0]:
            entry = ( altitude, 0, int(windData[1].replace('KT', '')) )
        else:
            entry = ( altitude, int(windData[0]), int(windData[1].replace('KT', '')) )
        table.append(entry)

        return table

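    # A hypothetical input line and its expected result, assumed from the
    # parsing logic below:
    #   parseNextUpdateTime('Aktualisierung erfolgt um 18.00 UTC')
    #   -> datetime for today (or tomorrow, after a day overlap) at 18:00 UTC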
    @staticmethod
    def parseNextUpdateTime(line : str):
        entries = line.split(' ')

        # locate the time token: fall back to the third token, otherwise
        # take the token in front of 'UTC'
        utcIndex = 2
        if 'UTC' in entries[len(entries) - 2]:
            utcIndex = len(entries) - 3
        elif 'UTC' in entries[len(entries) - 1]:
            utcIndex = len(entries) - 2

        currentUtc = dt.utcfromtimestamp(int(time.time()))
        currentHour = int(currentUtc.strftime('%H'))

        # check if we have a day overlap
        if currentHour > int(entries[utcIndex].split('.')[0]):
            nextDay = currentUtc + datetime.timedelta(days=1)
            date = nextDay.strftime('%Y-%m-%d')
        else:
            date = currentUtc.strftime('%Y-%m-%d')

        # create the new UTC update time
        return dt.strptime(date + ' ' + entries[utcIndex] + '+0000', '%Y-%m-%d %H.%M%z')

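    # Sketch of the kind of <pre> content this parser expects, assumed from the
    # parsing logic (the real DWD page layout may differ):
    #   GAFOR-Gebiete: 10-12,15
    #   AMSL       | Wind
    #   2000FT     | 270/10 KT
    #   FL050      | 280/15 KT
    #
    #   Aktualisierung erfolgt um 18.00 UTC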
    def parseGaforPage(self, url : str):
        with urllib.request.urlopen(url) as site:
            data = site.read().decode('utf-8')

            parsed = BeautifulSoup(data, features='lxml')

            # search the info about the GAFOR areas
            content = None
            for element in parsed.body.find_all('pre'):
                content = element.text

            # analyze the received data
            if None != content:
                windInformation = []
                nextUpdate = None
                windTable = []
                areaIds = None

                # find all relevant information
                for line in content.splitlines():
                    if '' == line:
                        if 0 != len(windTable):
                            windInformation.append(( areaIds, windTable ))
                        areaIds = None
                        windTable = []
                    elif line.startswith('GAFOR-Gebiete'):
                        areaIds = DwdCrawler.parseGaforAreas(line)
                        windTable = []
                    elif None != areaIds:
                        windTable = DwdCrawler.parseWindTableRow(line, windTable)
                    elif 'Aktualisierung erfolgt um ' in line:
                        nextUpdate = DwdCrawler.parseNextUpdateTime(line)

                # return the collected information
                if 0 == len(windInformation) or None == nextUpdate:
                    return None, None
                else:
                    return nextUpdate, windInformation

        # no parsable content found on the page
        return None, None

    def receiveWindData(self):
        self.UpdateTime = None
        self.WindData = None

        with urllib.request.urlopen('https://www.dwd.de/DE/fachnutzer/luftfahrt/teaser/luftsportberichte/luftsportberichte_node.html') as site:
            data = site.read().decode('utf-8')

            # find the pages of the GAFOR reports
            pages = []
            parsed = BeautifulSoup(data, features='lxml')
            for link in parsed.body.find_all('a', title=True):
                if 'node' in link['href'] and 'Flugwetterprognose' in link['title']:
                    # remove the jsession from the link
                    pages.append('https://www.dwd.de/' + link['href'].split(';')[0])

            # receive the wind data
            self.UpdateTime = None
            self.WindData = []
            for page in pages:
                nextUpdate, wind = self.parseGaforPage(page)
                if None != nextUpdate:
                    # keep the earliest upcoming update time of all pages
                    if None == self.UpdateTime or self.UpdateTime > nextUpdate:
                        self.UpdateTime = nextUpdate
                    self.WindData.extend(wind)

        # indicate that new wind data is available
        return None != self.UpdateTime
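
# Minimal usage sketch (illustrative; requires network access to the DWD pages):
if __name__ == '__main__':
    crawler = DwdCrawler()
    if crawler.receiveWindData():
        print('Next update (UTC):', crawler.UpdateTime)
        for areaIds, table in crawler.WindData:
            print('GAFOR areas:', areaIds)
            for altitude, direction, speed in table:
                print('  ', altitude, 'ft AMSL:', direction, '/', speed, 'KT')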