Extract
Extractยค
As a first step, we will download the all the data files from the official website of Rideindego.
The web page contains multiple links to the file. We will get all the links to the data files using DataLinkHTMLExtractor
defined in utils.fetch
. We will then run through each link and download the zip file using DataDownloader
.
import os
import click
import simplejson as json
from dotenv import load_dotenv
from loguru import logger
from utils.fetch import DataDownloader, DataLinkHTMLExtractor
from utils.fetch import get_page_html as _get_page_html
from haferml.blend.config import Config
from haferml.sync.local import prepare_folders
load_dotenv()
@click.command()
@click.option(
"-c",
"--config",
type=str,
default=os.getenv("CONFIG_FILE"),
help="Path to config file",
)
def extract(config):
base_folder = os.getenv("BASE_FOLDER")
_CONFIG = Config(config, base_folder=base_folder)
etl_trip_data_config = _CONFIG.get(["etl", "raw", "trip_data"])
logger.info(f"Using config: {etl_trip_data_config}")
# create folders
prepare_folders(etl_trip_data_config["local"], base_folder)
# if not os.path.exists(etl_trip_data_config["local"]):
# os.makedirs(etl_trip_data_config["local"])
# Download Raw Data
source_link = etl_trip_data_config["source"]
logger.info(f"Will download from {source_link}")
page = _get_page_html(source_link).get("data", {})
page_extractor = DataLinkHTMLExtractor(page)
links = page_extractor.get_data_links()
logger.info(f"Extracted links from {source_link}: {links}")
# Download data
dld = DataDownloader(links, data_type="zip", folder=etl_trip_data_config["local_absolute"])
dld.run()
if __name__ == "__main__":
extract()