# import packages
import asyncio
import aiohttp
import pandas
import os
from tqdm import tqdm
async def get_hcd(session, url):
    """
    Request the ``content-disposition`` header of a URL with a HEAD request.

    If the header is present it is returned joined to the URL with ``"__"``. If the response carries no such header,
    or the request itself fails (timeout or client/connection error), only the URL is returned so that the caller can
    record it and retry it later.

    Parameters
    ----------
    session: aiohttp.client.ClientSession
        The client session used to send the request.
    url: str
        The URL from which the header information should be read.

    Returns
    -------
    head_content_disposition + "__" + url: str
        The head content disposition plus the corresponding URL as one string.
    url: str
        Only the URL, if no content disposition could be obtained.
    """
    try:
        # request head; only the headers are needed, so the body is never read
        async with session.head(url) as resp:
            # .get returns None instead of raising KeyError when the header is missing
            head_content_disposition = resp.headers.get("content-disposition")
    # a request that fails must not abort every other request gathered alongside it,
    # so it is reported exactly like a missing header
    except asyncio.TimeoutError:
        return url
    except aiohttp.ClientError:
        return url
    # no content disposition in the response: return the url only
    if head_content_disposition is None:
        return url
    # return hcd + url
    return head_content_disposition + "__" + url
# framework function for the request function
async def framework_requests(start=0, stop=0, list_of_ids=None):
    """
    Request the head content dispositions of a set of download URLs concurrently.

    One URL is built per id, either from ``list_of_ids`` or, if that is None, from ``range(start, stop)``. All URLs
    are requested with ``get_hcd`` inside a single client session and the results are split into successful answers
    and URLs for which only the URL came back.

    Parameters
    ----------
    start: int
        The number of the id part of the first URL to be checked.
    stop: int
        The number of the id part of the last URL to be checked (exclusive, as in ``range``).
    list_of_ids: list of str or None
        Explicit ids to request instead of the range. It is used for the additional checking of the key error URLs.

    Returns
    -------
    hcd__url_list: list of str
        A list with the head content dispositions plus the corresponding URLs.
    url_key_error_list: list of str
        A list with all URLs for which no head content disposition was returned.
    """
    base_url = (
        "https://geoportal.geoportal-th.de/gaialight-th/_apps/dladownload/download.php?type=op&id="
    )
    # an explicit id list takes precedence over the start/stop range
    ids = range(start, stop) if list_of_ids is None else list_of_ids
    urls = [base_url + str(number) for number in ids]

    # set timeout time / time limit
    timeout = aiohttp.ClientTimeout(total=12000)
    # create client session so that not every request will open a new connection
    async with aiohttp.ClientSession(timeout=timeout) as session:
        # run all requests concurrently; gather returns the results in the order of the urls
        results = await asyncio.gather(*(get_hcd(session, url) for url in urls))

    hcd__url_list = []
    url_key_error_list = []
    for url, result in zip(urls, results):
        # get_hcd hands back the bare url when it could not read a content disposition;
        # comparing with the requested url does not depend on how the header value starts
        if result == url:
            url_key_error_list.append(result)
        else:
            hcd__url_list.append(result)
    # return hcd + url and only url(KeyError) list
    return hcd__url_list, url_key_error_list
def _extract_url_id(text):
    """Return the number following the last ``id=`` in *text* as int, or ``""`` if there is none."""
    candidate = text.rpartition("id=")[2]
    return int(candidate) if candidate.isdecimal() else ""


def _parse_hcd_entry(hcd__url):
    """Return ``(url_id, year, tile_number)`` for one hcd + url string, or None if it contains no tile marker."""
    url_id = _extract_url_id(hcd__url)
    # the year is the four characters starting three characters after the first "th"
    th_position = hcd__url.find("th")
    year = hcd__url[th_position + 3:th_position + 7]
    # the tile number is the eight characters after "_32_" or, failing that, after "_32"
    marker_position = hcd__url.find("_32_")
    if marker_position != -1:
        tile_number = hcd__url[marker_position + 4:marker_position + 12]
    else:
        marker_position = hcd__url.find("_32")
        if marker_position == -1:
            # not a tile download: sort out data that is not relevant
            return None
        tile_number = hcd__url[marker_position + 3:marker_position + 11]
    return url_id, year, tile_number


def create_url_id_file(start, stop, out_path, number_of_requests=100):
    """
    Create CSV files that map the id part of the download URLs to the year of data acquisition and the tile number.

    The ids from ``start`` up to (not including) ``stop`` are processed in chunks of at most ``number_of_requests``
    ids. For every chunk the framework function is called to get the head content dispositions; URLs that returned
    none are requested up to two more times. Each chunk produces a ``<first>_<last>_url_id_file.csv`` with the columns
    ``url_id``, ``year`` and ``tile_number`` and, if URLs remain unanswered, a ``<first>_<last>_url_KeyError.csv`` with
    the column ``error_urls``. Finally, the content of all chunk ``url_id_file`` CSV files found in ``out_path`` is
    merged into ``url_id_file.csv``.

    Parameters
    ----------
    start: int
        The number of the id part of the first URL to be checked.
    stop: int
        The number of the id part of the last URL to be checked (exclusive, as in ``range``).
    out_path: str
        Path to the folder where the output should be stored.
    number_of_requests: int
        Maximum number of concurrent requests (if the performance is not important, the default value should be kept).

    Returns
    -------
    path_name: str or None
        The path to the URL id file, or None if there was nothing to merge.
    """
    # prevents RuntimeError: Event loop is closed (the selector policy only exists on Windows)
    if hasattr(asyncio, "WindowsSelectorEventLoopPolicy"):
        asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())

    for chunk_start in tqdm(range(start, stop, number_of_requests)):
        # never request ids beyond stop in the last chunk
        chunk_stop = min(chunk_start + number_of_requests, stop)
        # run framework function
        hcd__url_re_list, url_key_error_re_list = asyncio.run(
            framework_requests(start=chunk_start, stop=chunk_stop)
        )
        # the urls are sometimes not reachable for a short time, so every url without an answer
        # is checked a second and, if necessary, a third time
        for _ in range(2):
            if not url_key_error_re_list:
                break
            error_url_id_list = [_extract_url_id(error_url) for error_url in url_key_error_re_list]
            recovered_list, url_key_error_re_list = asyncio.run(
                framework_requests(list_of_ids=error_url_id_list)
            )
            # add the recovered hcd + url strings to the already existing ones
            hcd__url_re_list = hcd__url_re_list + recovered_list

        # if the returned hcd + url list is not empty export the url ids, years and tile numbers
        if hcd__url_re_list:
            rows = [row for row in map(_parse_hcd_entry, hcd__url_re_list) if row is not None]
            url_id_df = pandas.DataFrame(rows, columns=["url_id", "year", "tile_number"])
            name = str(chunk_start) + "_" + str(chunk_stop) + "_url_id_file.csv"
            url_id_df.to_csv(os.path.join(out_path, name), index=False)
        # if url(KeyError) list is not empty export a csv file with the "error" urls
        if url_key_error_re_list:
            error_df = pandas.DataFrame({"error_urls": url_key_error_re_list})
            name = str(chunk_start) + "_" + str(chunk_stop) + "_url_KeyError.csv"
            error_df.to_csv(os.path.join(out_path, name), index=False)

    merged_name = "url_id_file.csv"
    # collect the chunk files; a merged file from an earlier run also contains "url_id_file"
    # in its name and must not be merged into itself
    with os.scandir(out_path) as entries:
        filenames_list = sorted(
            entry.name
            for entry in entries
            if entry.name != merged_name and "KeyError" not in entry.name and "url_id_file" in entry.name
        )
    try:
        # combine the content of all the files
        combined_content = pandas.concat(
            [pandas.read_csv(os.path.join(out_path, filename)) for filename in filenames_list]
        )
    except ValueError:
        print("There are no objects to concatenate. Probably there are no relevant files.")
        return None
    # create a csv file with the combined content
    path_name = os.path.join(out_path, merged_name)
    combined_content.to_csv(path_name, index=False)
    return path_name