from datetime import datetime, date, timedelta
from pathlib import Path

import pandas as pd

import click

import yfinance as yf
from curl_cffi import requests  # Without its Session object, Yahoo will reject requests with YFRateLimitError

"""
Download quotes from Yahoo Finance and store them as CSV files, one file per symbol.
"""


def download_klines(config, data_sources):
    """
    Download daily quotes for each data source from Yahoo Finance and store them as CSV
    files under the configured data folder. If a file already exists, only the rows after
    its last stored date are fetched and merged in; otherwise the full history is downloaded.
    """
    time_column = config["time_column"]
    data_path = Path(config["data_folder"])
    download_max_rows = config.get("download_max_rows", 0)  # 0 (falsy) means no row limit

    now = datetime.now()

    # This session will be used in all requests to avoid YFRateLimitError
    session = requests.Session(impersonate="chrome")

    for ds in data_sources:
        # Assumption: the folder name is equal to the symbol name we want to download
        quote = ds.get("folder")
        if not quote:
            print("ERROR. Folder is not specified.")
            continue

        # If the file name is not specified, use the symbol name as the file name
        file = ds.get("file") or quote

        print(f"Start downloading '{quote}' ...")

        file_path = data_path / quote
        file_path.mkdir(parents=True, exist_ok=True)  # Ensure that the folder exists

        file_name = (file_path / file).with_suffix(".csv")

        if file_name.is_file():
            # Incremental update: load the stored rows, then fetch only the recent days
            df = pd.read_csv(file_name, parse_dates=[time_column], date_format="ISO8601")
            #df['Date'] = pd.to_datetime(df['Date'], format="ISO8601")  # "2022-06-07" iso format
            df[time_column] = df[time_column].dt.date

            last_date = df.iloc[-1][time_column]

            # The overlap can be longer than strictly needed because the difference in days
            # also counts weekends, which are not trading days
            overlap = 2
            days = (pd.Timestamp(now) - pd.Timestamp(last_date)).days + overlap

            # === Download from the remote server
            # Download more data than we need and then overwrite the older data
            new_df = yf.download(quote, period=f"{days}d", auto_adjust=True, multi_level_index=False, session=session)

            new_df = new_df.reset_index()
            new_df['Date'] = pd.to_datetime(new_df['Date'], format="ISO8601", utc=True).dt.date
            #del new_df['Close']
            #new_df.rename({'Adj Close': 'Close'}, axis=1, inplace=True)
            new_df.rename({'Date': time_column}, axis=1, inplace=True)
            new_df.columns = new_df.columns.str.lower()

            # The newly downloaded rows overwrite the overlapping older rows
            df = pd.concat([df, new_df])
            df = df.drop_duplicates(subset=[time_column], keep="last")
        else:
            print("File not found. Full fetch...")

            # === Download from the remote server
            #df = yf.download(quote, date(1990, 1, 1), auto_adjust=True, multi_level_index=False)
            df = yf.download(quote, period="max", auto_adjust=True, multi_level_index=False, session=session)

            df = df.reset_index()
            df['Date'] = pd.to_datetime(df['Date'], format="ISO8601", utc=True).dt.date
            #del df['Close']
            #df.rename({'Adj Close': 'Close'}, axis=1, inplace=True)
            df.rename({'Date': time_column}, axis=1, inplace=True)
            df.columns = df.columns.str.lower()

            print("Full fetch finished.")
        df = df.sort_values(by=time_column)

        # Limit the saved size to only the latest rows
        if download_max_rows:
            df = df.tail(download_max_rows)

        df.to_csv(file_name, index=False)

        print(f"Finished downloading '{quote}'. Stored {len(df)} rows in '{file_name}'")
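

# A minimal usage sketch (not part of the original module): the config keys and the
# data_sources fields mirror how download_klines reads them above, but the concrete
# values (folder/ticker names, column name, data folder) are hypothetical and purely
# illustrative.
if __name__ == "__main__":
    example_config = {
        "data_folder": "data",      # root folder; one sub-folder is created per symbol
        "time_column": "date",      # name of the date column in the stored CSV files
        "download_max_rows": 0,     # 0 (falsy) keeps all downloaded rows
    }
    example_data_sources = [
        {"folder": "SPY"},                   # the folder name doubles as the Yahoo ticker
        {"folder": "MSFT", "file": "MSFT"},  # optional explicit file name (without extension)
    ]
    download_klines(example_config, example_data_sources)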