mql5/Experts/Advisors/DualEA/ML/dataset.py

141 lines
5.4 KiB
Python
Raw Permalink Normal View History

2025-08-10 17:43:21 -04:00
import os
import pandas as pd
from typing import Tuple
# Default MT5 Common Files path on Windows
DEFAULT_COMMON = os.path.join(os.environ.get("APPDATA", ""), "MetaQuotes", "Terminal", "Common", "Files", "DualEA")
FEATURES_NAME = "features.csv"
def resolve_features_path(common_dir: str = None) -> str:
base = common_dir or DEFAULT_COMMON
return os.path.join(base, FEATURES_NAME)
def _read_csv_robust(path: str) -> pd.DataFrame:
encodings = [
"utf-8",
"utf-8-sig",
"utf-16",
"utf-16-le",
"utf-16-be",
"cp1252",
"latin1",
]
last_exc = None
for enc in encodings:
try:
return pd.read_csv(path, encoding=enc)
except UnicodeDecodeError as e:
last_exc = e
continue
except Exception as e:
# If it's not a decode error, bubble up immediately
raise
# Fallback with python engine ignoring bad lines if all decodes failed
return pd.read_csv(path, encoding="latin1", engine="python")
def load_features(common_dir: str = None) -> pd.DataFrame:
path = resolve_features_path(common_dir)
if not os.path.exists(path):
raise FileNotFoundError(f"features.csv not found at {path}")
# Comma-delimited, result of EA writes (encoding may vary depending on MT5 settings)
df = _read_csv_robust(path)
# Normalize columns to lower snake-case (handles header casing/spacing)
df.columns = [str(c).strip().lower().replace(" ", "_").replace("-", "_") for c in df.columns]
# Basic cleaning
df = df.dropna(how="all")
# Ensure timestamp is parsed if present
for col in ("time", "timestamp", "entry_time", "close_time"):
if col in df.columns:
try:
df[col] = pd.to_datetime(df[col])
except Exception:
pass
# If long format (feature/value), pivot to wide
if {"feature", "value"}.issubset(set(df.columns)):
# Coerce value to numeric
df["value"] = pd.to_numeric(df["value"], errors="coerce")
# Candidate keys to keep identity and slices stable
key_priority = [
["position_id"],
["deal_id"],
["order_id"],
["position_id", "deal_id", "order_id"],
["strategy", "symbol", "timeframe", "entry_time", "close_time"],
["strategy", "symbol", "entry_time", "close_time"],
["symbol", "entry_time", "close_time"],
]
keys = []
for cand in key_priority:
kk = [c for c in cand if c in df.columns]
if len(kk) == len(cand):
keys = kk
break
# Always try to include context columns for downstream use
for c in ("strategy", "symbol", "timeframe", "entry_time", "close_time"):
if c in df.columns and c not in keys:
keys.append(c)
if not keys:
# Fallback: use all non-feature columns except 'value' as keys to avoid data loss
keys = [c for c in df.columns if c not in ("feature", "value")]
# Pivot
wide = df.pivot_table(index=keys, columns="feature", values="value", aggfunc="last")
wide = wide.reset_index()
# Flatten columns after pivot
wide.columns = [str(c).strip().lower().replace(" ", "_") for c in wide.columns]
return wide
return df
def make_label_from_r_multiple(df: pd.DataFrame, threshold: float = 0.0) -> Tuple[pd.DataFrame, pd.Series]:
# Accept common variants
candidates = [
"r_multiple",
"r",
"rmultiple",
]
col = None
for c in candidates:
if c in df.columns:
col = c
break
if col is None:
# Try fuzzy search for any col containing both 'r' and 'multiple'
for c in df.columns:
s = str(c)
if "r" in s and "multiple" in s:
col = c
break
if col is None:
# Fallback 1: profit column present in the same frame
if "profit" in df.columns:
vals = pd.to_numeric(df["profit"], errors="coerce").fillna(0.0)
y = (vals > 0.0).astype(int)
return df, y
# Fallback 2: try to read knowledge_base.csv and merge
kb_path = os.path.join(os.path.dirname(resolve_features_path()), "knowledge_base.csv")
if os.path.exists(kb_path):
try:
kb = pd.read_csv(kb_path)
kb.columns = [str(c).strip().lower().replace(" ", "_").replace("-", "_") for c in kb.columns]
# Candidate join keys
join_keys_priority = [
["order_id"],
["deal_id"],
["position_id"],
["symbol", "close_time"],
]
for keys in join_keys_priority:
if all(k in df.columns for k in keys) and all(k in kb.columns for k in keys):
merged = pd.merge(df, kb[[*keys, "profit"]], on=keys, how="left")
vals = pd.to_numeric(merged["profit"], errors="coerce").fillna(0.0)
y = (vals > 0.0).astype(int)
return merged, y
except Exception:
pass
raise ValueError("features.csv must contain r_multiple (or 'r'); profit-based fallback also unavailable")
vals = pd.to_numeric(df[col], errors="coerce")
y = (vals > threshold).astype(int)
return df, y