Browse Source

add a crawler to parse DWD data

Sven Czarnian 3 years ago
parent
commit
d851efcd4d
1 changed files with 157 additions and 0 deletions
  1. 157 0
      aman/com/DwdCrawler.py

+ 157 - 0
aman/com/DwdCrawler.py

@@ -0,0 +1,157 @@
+#!/usr/bin/env python
+
+import datetime
+import time
+import urllib.request
+
+from bs4 import BeautifulSoup
+from datetime import datetime as dt
+from threading import Thread
+
+# @brief Checks the DWD pages for wind information
+# Format:
+#   Provides next update time (updateTime) of the DWD page in UTC
+#   Provides a list of wind information (windData)
+#       - organized as a list of tuples
+#           - first element of tuple: GAFOR-IDs for the following wind information
+#           - second element of tuple: list of tuples of wind data
+#               - first element of wind data tuple: minimum altitude AMSL for this wind information
+#               - second element of wind data tuple: wind direction
+#               - third  element of wind data tuple: wind speed (KT)
class DwdCrawler(Thread):
    """Crawls the DWD aviation pages for GAFOR wind information.

    The crawler starts itself on construction. After a successful run:
        updateTime    -- next announced page update time (timezone-aware UTC datetime)
        windData      -- list of (areaIds, windTable) tuples; windTable is a list of
                         (altitude AMSL [ft], wind direction, wind speed [KT]) tuples
        dataAvailable -- True once windData has been filled
        executing     -- False once the crawler thread has finished
    """

    def __init__(self):
        Thread.__init__(self)
        self.dataAvailable = False
        self.executing = True
        # self-starting thread; results are published via the flags above
        self.start()

    @staticmethod
    def parseGaforAreas(areas : str):
        """Parse a 'GAFOR-Gebiete: ...' line into a list of integer area IDs.

        Supports single IDs and inclusive ranges, e.g. '10-12,15' -> [10, 11, 12, 15].
        """
        areas = areas.replace(':', '').split(' ')[1]
        areaIds = []

        for segment in areas.split(','):
            borders = segment.split('-')
            if 2 == len(borders):
                # inclusive range definition, e.g. '10-12'
                areaIds.extend(range(int(borders[0]), int(borders[1]) + 1))
            else:
                areaIds.append(int(borders[0]))

        return areaIds

    @staticmethod
    def parseWindTableRow(row : str, table):
        """Parse one wind-table row of the form '<altitude> | <dir>/<speed>KT ...'.

        Appends (altitude_ft, direction, speed_kt) to table and returns table.
        Header rows ('AMSL') and malformed rows are ignored.
        """
        entries = row.split('|')

        # skip invalid lines and the table header
        if 2 > len(entries) or 'AMSL' in entries[0]:
            return table

        windData = entries[1].strip().split(' ')[0].split('/')
        if 2 != len(windData):
            return table

        altitude = entries[0].strip()
        if 'FL' in altitude:
            # flight level -> feet
            altitude = int(altitude.replace('FL', '')) * 100
        else:
            altitude = int(altitude.replace('FT', ''))

        table.append(( altitude, int(windData[0]), int(windData[1].replace('KT', '')) ))
        return table

    @staticmethod
    def parseNextUpdateTime(line : str):
        """Extract the next update time from an 'Aktualisierung erfolgt um HH.MM UTC' line.

        Returns a timezone-aware UTC datetime on the correct day (tomorrow if the
        announced hour already passed today), or None if no time token is found.
        """
        entries = line.split(' ')

        # locate the 'HH.MM' token: it sits directly before the 'UTC' marker
        utcIndex = None
        if 2 <= len(entries) and 'UTC' in entries[len(entries) - 2]:
            utcIndex = len(entries) - 3
        elif 'UTC' in entries[len(entries) - 1]:
            # BUGFIX: was len(entries - 2), a TypeError at runtime
            utcIndex = len(entries) - 2
        elif 4 <= len(entries):
            utcIndex = 2
        if utcIndex is None:
            return None

        currentUtc = dt.now(datetime.timezone.utc)

        # check if we have a day overlap (the announced hour already passed today)
        if currentUtc.hour > int(entries[utcIndex].split('.')[0]):
            nextDay = currentUtc + datetime.timedelta(days=1)
            date = nextDay.strftime('%Y-%m-%d')
        else:
            date = currentUtc.strftime('%Y-%m-%d')

        # create the new UTC update time
        return dt.strptime(date + ' ' + entries[utcIndex] + '+0000', '%Y-%m-%d %H.%M%z')

    def parseGaforPage(self, url : str):
        """Download and parse a single GAFOR page.

        Returns (updateTime, windInformation) on success, None otherwise.
        """
        with urllib.request.urlopen(url) as site:
            data = site.read().decode('utf-8')

        parsed = BeautifulSoup(data, features='lxml')

        # the wind report is the last <pre> element of the page
        content = None
        for element in parsed.body.find_all('pre'):
            content = element.text
        if content is None:
            return None

        windInformation = []
        updateTime = None  # BUGFIX: was misspelled 'udpdateTime', causing a NameError
        windTable = []
        areaIds = None

        for line in content.splitlines():
            if '' == line:
                # a blank line terminates the current area section
                if 0 != len(windTable):
                    windInformation.append(( areaIds, windTable ))
                areaIds = None
                windTable = []
            elif line.startswith('GAFOR-Gebiete'):
                areaIds = DwdCrawler.parseGaforAreas(line)
                windTable = []
            elif 'Aktualisierung erfolgt um ' in line:
                # BUGFIX: checked before the wind-table branch so the update
                # line inside an area section is not swallowed as table data
                updateTime = DwdCrawler.parseNextUpdateTime(line)
            elif areaIds is not None:
                windTable = DwdCrawler.parseWindTableRow(line, windTable)

        # return the collected information
        if 0 != len(windInformation) and updateTime is not None:
            return updateTime, windInformation
        return None

    def run(self):
        """Thread entry point: find all GAFOR report pages and aggregate wind data."""
        with urllib.request.urlopen('https://www.dwd.de/DE/fachnutzer/luftfahrt/teaser/luftsportberichte/luftsportberichte_node.html') as site:
            data = site.read().decode('utf-8')

        # find the pages of the GAFOR reports
        pages = []
        parsed = BeautifulSoup(data, features='lxml')
        for link in parsed.body.find_all('a', title=True):
            if 'node' in link['href'] and 'Flugwetterprognose' in link['title']:
                # remove the jsession from the link
                pages.append('https://www.dwd.de/' + link['href'].split(';')[0])

        # receive the wind data
        self.updateTime = None
        self.windData = []
        for page in pages:
            # BUGFIX: parseGaforPage may return None; unpacking it directly crashed
            result = self.parseGaforPage(page)
            if result is not None:
                nextUpdate, wind = result
                # keep the earliest of all announced update times
                if self.updateTime is None or self.updateTime > nextUpdate:
                    self.updateTime = nextUpdate
                self.windData.extend(wind)

        # indicate that new wind data is available
        if self.updateTime is not None:
            self.dataAvailable = True

        self.executing = False