Browse Source

add a crawler to parse DWD data

Sven Czarnian 3 years ago
parent
commit
d851efcd4d
1 changed files with 157 additions and 0 deletions
  1. 157 0
      aman/com/DwdCrawler.py

+ 157 - 0
aman/com/DwdCrawler.py

@@ -0,0 +1,157 @@
+#!/usr/bin/env python
+
+import datetime
+import time
+import urllib.request
+
+from bs4 import BeautifulSoup
+from datetime import datetime as dt
+from threading import Thread
+
+# @brief Checks the DWD pages for wind information
+# Format:
+#   Provides next update time (updateTime) of the DWD page in UTC
+#   Provides a list of wind information (windData)
+#       - organized as a list of tuples
+#           - first element of tuple: GAFOR-IDs for the following wind information
+#           - second element of tuple: list of tuples of wind data
+#               - first element of wind data tuple: minimum altitude AMSL for this wind information
+#               - second element of wind data tuple: wind direction
+#               - third  element of wind data tuple: wind speed (KT)
class DwdCrawler(Thread):
    """Crawls the DWD aviation pages for GAFOR wind information.

    The crawler starts itself on construction. After a successful run:
        updateTime    -- next announced page update time (timezone-aware UTC datetime)
        windData      -- list of (areaIds, windTable) tuples; windTable is a list of
                         (altitude AMSL [ft], wind direction, wind speed [KT]) tuples
        dataAvailable -- True once windData has been filled
        executing     -- False once the crawler thread has finished
    """

    def __init__(self):
        Thread.__init__(self)
        self.dataAvailable = False
        self.executing = True
        # self-starting thread; results are published via the flags above
        self.start()

    @staticmethod
    def parseGaforAreas(areas : str):
        """Parse a 'GAFOR-Gebiete: ...' line into a list of integer area IDs.

        Supports single IDs and inclusive ranges, e.g. '10-12,15' -> [10, 11, 12, 15].
        """
        areas = areas.replace(':', '').split(' ')[1]
        areaIds = []

        for segment in areas.split(','):
            borders = segment.split('-')
            if 2 == len(borders):
                # inclusive range definition, e.g. '10-12'
                areaIds.extend(range(int(borders[0]), int(borders[1]) + 1))
            else:
                areaIds.append(int(borders[0]))

        return areaIds

    @staticmethod
    def parseWindTableRow(row : str, table):
        """Parse one wind-table row of the form '<altitude> | <dir>/<speed>KT ...'.

        Appends (altitude_ft, direction, speed_kt) to table and returns table.
        Header rows ('AMSL') and malformed rows are ignored.
        """
        entries = row.split('|')

        # skip invalid lines and the table header
        if 2 > len(entries) or 'AMSL' in entries[0]:
            return table

        windData = entries[1].strip().split(' ')[0].split('/')
        if 2 != len(windData):
            return table

        altitude = entries[0].strip()
        if 'FL' in altitude:
            # flight level -> feet
            altitude = int(altitude.replace('FL', '')) * 100
        else:
            altitude = int(altitude.replace('FT', ''))

        table.append(( altitude, int(windData[0]), int(windData[1].replace('KT', '')) ))
        return table

    @staticmethod
    def parseNextUpdateTime(line : str):
        """Extract the next update time from an 'Aktualisierung erfolgt um HH.MM UTC' line.

        Returns a timezone-aware UTC datetime on the correct day (tomorrow if the
        announced hour already passed today), or None if no time token is found.
        """
        entries = line.split(' ')

        # locate the 'HH.MM' token: it sits directly before the 'UTC' marker
        utcIndex = None
        if 2 <= len(entries) and 'UTC' in entries[len(entries) - 2]:
            utcIndex = len(entries) - 3
        elif 'UTC' in entries[len(entries) - 1]:
            # BUGFIX: was len(entries - 2), a TypeError at runtime
            utcIndex = len(entries) - 2
        elif 4 <= len(entries):
            utcIndex = 2
        if utcIndex is None:
            return None

        currentUtc = dt.now(datetime.timezone.utc)

        # check if we have a day overlap (the announced hour already passed today)
        if currentUtc.hour > int(entries[utcIndex].split('.')[0]):
            nextDay = currentUtc + datetime.timedelta(days=1)
            date = nextDay.strftime('%Y-%m-%d')
        else:
            date = currentUtc.strftime('%Y-%m-%d')

        # create the new UTC update time
        return dt.strptime(date + ' ' + entries[utcIndex] + '+0000', '%Y-%m-%d %H.%M%z')

    def parseGaforPage(self, url : str):
        """Download and parse a single GAFOR page.

        Returns (updateTime, windInformation) on success, None otherwise.
        """
        with urllib.request.urlopen(url) as site:
            data = site.read().decode('utf-8')

        parsed = BeautifulSoup(data, features='lxml')

        # the wind report is the last <pre> element of the page
        content = None
        for element in parsed.body.find_all('pre'):
            content = element.text
        if content is None:
            return None

        windInformation = []
        updateTime = None  # BUGFIX: was misspelled 'udpdateTime', causing a NameError
        windTable = []
        areaIds = None

        for line in content.splitlines():
            if '' == line:
                # a blank line terminates the current area section
                if 0 != len(windTable):
                    windInformation.append(( areaIds, windTable ))
                areaIds = None
                windTable = []
            elif line.startswith('GAFOR-Gebiete'):
                areaIds = DwdCrawler.parseGaforAreas(line)
                windTable = []
            elif 'Aktualisierung erfolgt um ' in line:
                # BUGFIX: checked before the wind-table branch so the update
                # line inside an area section is not swallowed as table data
                updateTime = DwdCrawler.parseNextUpdateTime(line)
            elif areaIds is not None:
                windTable = DwdCrawler.parseWindTableRow(line, windTable)

        # return the collected information
        if 0 != len(windInformation) and updateTime is not None:
            return updateTime, windInformation
        return None

    def run(self):
        """Thread entry point: find all GAFOR report pages and aggregate wind data."""
        with urllib.request.urlopen('https://www.dwd.de/DE/fachnutzer/luftfahrt/teaser/luftsportberichte/luftsportberichte_node.html') as site:
            data = site.read().decode('utf-8')

        # find the pages of the GAFOR reports
        pages = []
        parsed = BeautifulSoup(data, features='lxml')
        for link in parsed.body.find_all('a', title=True):
            if 'node' in link['href'] and 'Flugwetterprognose' in link['title']:
                # remove the jsession from the link
                pages.append('https://www.dwd.de/' + link['href'].split(';')[0])

        # receive the wind data
        self.updateTime = None
        self.windData = []
        for page in pages:
            # BUGFIX: parseGaforPage may return None; unpacking it directly crashed
            result = self.parseGaforPage(page)
            if result is not None:
                nextUpdate, wind = result
                # keep the earliest of all announced update times
                if self.updateTime is None or self.updateTime > nextUpdate:
                    self.updateTime = nextUpdate
                self.windData.extend(wind)

        # indicate that new wind data is available
        if self.updateTime is not None:
            self.dataAvailable = True

        self.executing = False