from datetime import datetime, date, timedelta
from pathlib import Path

import pandas as pd

import click

import yfinance as yf
from curl_cffi import requests  # Without its Session object, Yahoo will reject requests with YFRateLimitError

"""
Download quotes from Yahoo Finance and store them as CSV files, one file per symbol.
"""


def download_klines(config, data_sources):
    """
    Download daily quotes for each data source from Yahoo Finance and store them as CSV
    files under the configured data folder. If a file already exists, only the rows after
    its last stored date are fetched and merged in; otherwise the full history is downloaded.
    """
    time_column = config["time_column"]
    data_path = Path(config["data_folder"])
    download_max_rows = config.get("download_max_rows", 0)  # 0 (falsy) means no row limit

    now = datetime.now()

    # This session will be used in all requests to avoid YFRateLimitError
    session = requests.Session(impersonate="chrome")

    for ds in data_sources:
        # Assumption: the folder name is equal to the symbol name we want to download
        quote = ds.get("folder")
        if not quote:
            print("ERROR. Folder is not specified.")
            continue

        # If the file name is not specified, use the symbol name as the file name
        file = ds.get("file") or quote

        print(f"Start downloading '{quote}' ...")

        file_path = data_path / quote
        file_path.mkdir(parents=True, exist_ok=True)  # Ensure that the folder exists

        file_name = (file_path / file).with_suffix(".csv")

        if file_name.is_file():
            # Incremental update: load the stored rows, then fetch only the recent days
            df = pd.read_csv(file_name, parse_dates=[time_column], date_format="ISO8601")
            #df['Date'] = pd.to_datetime(df['Date'], format="ISO8601")  # "2022-06-07" iso format
            df[time_column] = df[time_column].dt.date

            last_date = df.iloc[-1][time_column]

            # The overlap can be longer than strictly needed because the difference in days
            # also counts weekends, which are not trading days
            overlap = 2
            days = (pd.Timestamp(now) - pd.Timestamp(last_date)).days + overlap

            # === Download from the remote server
            # Download more data than we need and then overwrite the older data
            new_df = yf.download(quote, period=f"{days}d", auto_adjust=True, multi_level_index=False, session=session)

            new_df = new_df.reset_index()
            new_df['Date'] = pd.to_datetime(new_df['Date'], format="ISO8601", utc=True).dt.date
            #del new_df['Close']
            #new_df.rename({'Adj Close': 'Close'}, axis=1, inplace=True)
            new_df.rename({'Date': time_column}, axis=1, inplace=True)
            new_df.columns = new_df.columns.str.lower()

            # The newly downloaded rows overwrite the overlapping older rows
            df = pd.concat([df, new_df])
            df = df.drop_duplicates(subset=[time_column], keep="last")
        else:
            print("File not found. Full fetch...")

            # === Download from the remote server
            #df = yf.download(quote, date(1990, 1, 1), auto_adjust=True, multi_level_index=False)
            df = yf.download(quote, period="max", auto_adjust=True, multi_level_index=False, session=session)

            df = df.reset_index()
            df['Date'] = pd.to_datetime(df['Date'], format="ISO8601", utc=True).dt.date
            #del df['Close']
            #df.rename({'Adj Close': 'Close'}, axis=1, inplace=True)
            df.rename({'Date': time_column}, axis=1, inplace=True)
            df.columns = df.columns.str.lower()

            print("Full fetch finished.")
        df = df.sort_values(by=time_column)

        # Limit the saved size to only the latest rows
        if download_max_rows:
            df = df.tail(download_max_rows)

        df.to_csv(file_name, index=False)

        print(f"Finished downloading '{quote}'. Stored {len(df)} rows in '{file_name}'")
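

# A minimal usage sketch (not part of the original module): the config keys and the
# data_sources fields mirror how download_klines reads them above, but the concrete
# values (folder/ticker names, column name, data folder) are hypothetical and purely
# illustrative.
if __name__ == "__main__":
    example_config = {
        "data_folder": "data",      # root folder; one sub-folder is created per symbol
        "time_column": "date",      # name of the date column in the stored CSV files
        "download_max_rows": 0,     # 0 (falsy) keeps all downloaded rows
    }
    example_data_sources = [
        {"folder": "SPY"},                   # the folder name doubles as the Yahoo ticker
        {"folder": "MSFT", "file": "MSFT"},  # optional explicit file name (without extension)
    ]
    download_klines(example_config, example_data_sources)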