igdb/code/database/parser.py

80 lines
2.0 KiB
Python
Raw Normal View History

from iso3166 import countries as co
2020-01-06 06:34:35 +01:00
from pandas import DataFrame, concat, read_csv
from csv import QUOTE_NONNUMERIC
2020-01-08 21:31:00 +01:00
from database.constants import ADMIN_PW
2020-01-08 22:14:16 +01:00
from os import path
def country_conversion(political_unit: str) -> str:
    """Translate a country code into the country name known to ``iso3166``.

    The value ``"99"`` is returned unchanged instead of being looked up
    (presumably the dataset's placeholder for an unspecified country --
    TODO confirm against the WGMS documentation).

    Args:
        political_unit: Code as read from the ``POLITICAL_UNIT`` column.

    Returns:
        The ``name`` attribute of the matching ``iso3166`` record, or
        ``"99"`` for the placeholder.

    Note:
        ``co.get`` is called without a default, so an unrecognised code is
        expected to raise ``KeyError`` -- verify for the installed
        ``iso3166`` version.
    """
    if political_unit == "99":
        return "99"
    return co.get(political_unit).name
def select_columns(
    csv_path: str = "../data/WGMS-FoG-2019-12-B-STATE.csv",
    min_year: int = 2010,
) -> DataFrame:
    """Load the needed columns of the WGMS state CSV, keeping recent rows only.

    The file is read in chunks of 100 rows; each chunk is filtered before the
    pieces are concatenated, so rows that fail the year filter are dropped
    chunk by chunk. ``POLITICAL_UNIT`` values are passed through
    ``country_conversion`` while parsing.

    Args:
        csv_path: Location of the source CSV. Defaults to the path that was
            previously hard-coded (relative to the current working directory).
        min_year: Exclusive lower bound on ``YEAR``; only rows with
            ``YEAR > min_year`` are kept. Defaults to the previous constant.

    Returns:
        A DataFrame with the columns ``POLITICAL_UNIT``, ``NAME``,
        ``WGMS_ID``, ``YEAR``, ``MEDIAN_ELEVATION``, ``AREA`` and ``LENGTH``.
    """
    fields = [
        "POLITICAL_UNIT",
        "NAME",
        "WGMS_ID",
        "YEAR",
        "MEDIAN_ELEVATION",
        "AREA",
        "LENGTH",
    ]
    chunks = read_csv(
        csv_path,
        skipinitialspace=True,
        usecols=fields,
        iterator=True,
        chunksize=100,
        converters={"POLITICAL_UNIT": country_conversion},
    )
    # Strictly greater-than: rows from ``min_year`` itself are excluded.
    return concat([chunk[chunk["YEAR"] > min_year] for chunk in chunks])
2020-01-06 06:34:35 +01:00
2020-01-08 01:23:00 +01:00
def rename_fields(df_list: dict) -> dict:
    """Return a new mapping whose DataFrames use the database column names.

    Args:
        df_list: Mapping of table name to DataFrame (despite the parameter
            name, it is iterated with ``.items()``). Columns that are not in
            the rename table are left as they are.

    Returns:
        A new dict with the same keys; each value is the result of
        ``DataFrame.rename`` and the input frames are not modified.
    """
    new_fields = {
        "POLITICAL_UNIT": "country",
        "NAME": "name",
        "WGMS_ID": "uid",
        "YEAR": "year",
        "MEDIAN_ELEVATION": "elevation",
        "AREA": "surface",
        "LENGTH": "length",
    }
    return {
        key: frame.rename(columns=new_fields) for key, frame in df_list.items()
    }
2020-01-06 08:31:21 +01:00
def create_databases(df: DataFrame) -> None:
    """Split *df* into the glacier, annual-data and user tables and save them.

    Three CSV files are produced under ``../data``:

    * ``glacier.csv`` -- distinct (country, name, uid) rows.
    * ``annual_data.csv`` -- uid, year, surface, elevation and length per row.
    * ``user.csv`` -- a single ``admin`` row built from ``ADMIN_PW``.

    A file that already exists is left untouched, so re-running never
    overwrites previously written tables.

    Args:
        df: Frame holding the ``POLITICAL_UNIT``, ``NAME``, ``WGMS_ID``,
            ``YEAR``, ``AREA``, ``MEDIAN_ELEVATION`` and ``LENGTH`` columns.
    """
    files = {
        "glacier": "../data/glacier.csv",
        "annual_data": "../data/annual_data.csv",
        "user": "../data/user.csv",
    }
    # NOTE(review): ADMIN_PW is written to user.csv verbatim -- confirm that
    # the constant is already hashed rather than a plaintext password.
    user = {"uid": [7843], "username": ["admin"], "password": [ADMIN_PW]}
    dataframes = {
        "glacier": df[["POLITICAL_UNIT", "NAME", "WGMS_ID"]].drop_duplicates(),
        "annual_data": df[["WGMS_ID", "YEAR", "AREA", "MEDIAN_ELEVATION", "LENGTH"]],
        "user": DataFrame(user),
    }
    for key, frame in rename_fields(dataframes).items():
        target = files[key]
        # Only create missing files; existing tables are kept as they are.
        if not path.isfile(target):
            frame.to_csv(target, index=False, quoting=QUOTE_NONNUMERIC)
2020-01-06 06:34:35 +01:00
def main() -> None:
    """Read the filtered source data and write the CSV tables from it."""
    create_databases(select_columns())
2020-01-06 06:34:35 +01:00
# Run the conversion only when this file is executed as a script.
if __name__ == "__main__":
    main()