Infinite Batch Size Problem

So I have an AI that tries to decide whether it should buy, hold, or sell stocks. Whenever I set the batch size to anything under 64, it shows the progress bars but also starts filling up memory infinitely, even with a batch size of 1. But when I set the batch size to anything above 64, no matter how big, it never fills up the RAM. I even tried a batch size of 10,000,000,000 and it ran with no issues for about 250 episodes until I stopped the code, since each episode took a while. I'm currently running this on a laptop with an NVIDIA 4060, in an env on WSL2.
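In case it helps with reproducing, this is roughly how memory can be tracked per episode (a minimal sketch using psutil, which the script below already imports; log_memory is just an illustrative helper, not something in my code):

import psutil

def log_memory(episode):
    # Log both system-wide memory usage and this process's resident set size
    mem = psutil.virtual_memory()
    rss_gb = psutil.Process().memory_info().rss / 1024 ** 3
    print(f"Episode {episode}: system memory {mem.percent}% used, process RSS {rss_gb:.2f} GB", flush=True)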

Here is my code. If anyone can help or figure out a fix, I could maybe start using this code on Lambda, which I would love to be able to do.

# Import necessary libraries
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.callbacks import ModelCheckpoint
import gym
from gym import spaces
import os
import sys
import io
import random
import logging
import matplotlib.pyplot as plt
import gc
import psutil

mem = psutil.virtual_memory()
print(f"Memory usage: {mem.percent}%")

# Check for available GPUs
gpus = tf.config.list_physical_devices('GPU')
if gpus:
    try:
        # Enable memory growth on all GPUs
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        print(f"Using GPU: {len(gpus)} available")
    except RuntimeError as e:
        print(e)
else:
    print("No GPU found, running on CPU")

# Enable mixed precision to speed up training
tf.keras.mixed_precision.set_global_policy('mixed_float16')

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '0'  # Show all logs (including info)
os.environ['TF_GPU_ALLOCATOR'] = 'cuda_malloc_async'

sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')

logging.basicConfig(
    filename='training.log',
    filemode='a',
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO
)

logging.info("Script started.")

# Simple AI trading environment
class TradingEnv(gym.Env):
    def __init__(self, df, initial_balance=10000, window_size=120, risk_per_trade=1, stop_loss_percent=1):
        super(TradingEnv, self).__init__()
        self.df = df
        self.current_step = 0
        self.initial_balance = initial_balance
        self.balance = self.initial_balance
        self.num_shares = 0
        self.window_size = window_size

        # Action space: 0 = Hold, 1 = Buy, 2 = Sell
        self.action_space = spaces.Discrete(3)
        # Observation space: A window of previous prices and current balance
        self.observation_space = spaces.Box(low=0, high=np.inf, shape=(window_size, 6), dtype=np.float32)  # 6 features: OHLCV + balance

        # Risk management parameters
        self.risk_per_trade = risk_per_trade  # Fraction of the balance risked per trade
        self.stop_loss_percent = stop_loss_percent  # Stop-loss threshold as a fraction of the entry price

    def reset(self):
        self.current_step = self.window_size
        self.balance = self.initial_balance
        self.num_shares = 0
        self.entry_price = 0  # Track entry price for stop-loss
        return self._next_observation()

    def _next_observation(self):
        window = self.df.iloc[self.current_step - self.window_size:self.current_step][['Open', 'High', 'Low', 'Close', 'Volume']].values
        balance_data = np.array([[self.balance]] * self.window_size)
        return np.concatenate((window, balance_data), axis=1).astype(np.float32)

    def step(self, action):
        current_price = self.df.iloc[self.current_step]['Close']
        reward = 0

        if action == 1:  # Buy
            # Calculate position size based on risk management
            risk_amount = self.balance * self.risk_per_trade  # Amount willing to risk
            shares_to_buy = int(risk_amount / current_price)  # Calculate shares based on risk
            if self.balance >= shares_to_buy * current_price:
                self.num_shares += shares_to_buy
                self.balance -= shares_to_buy * current_price
                self.entry_price = current_price  # Set entry price for stop-loss

        elif action == 2:  # Sell
            if self.num_shares > 0:
                reward = self.num_shares * (current_price - self.entry_price)  # Profit calculation
                self.balance += self.num_shares * current_price
                self.num_shares = 0

        # Check for stop-loss condition
        if self.num_shares > 0 and current_price < self.entry_price * (1 - self.stop_loss_percent):
            # Execute stop-loss
            reward = self.num_shares * (current_price - self.entry_price)  # Loss calculation
            self.balance += self.num_shares * current_price
            self.num_shares = 0  # Sell all shares at stop-loss
            print(f"Stop-loss triggered at step {self.current_step}, selling shares.")

        # Move to the next step
        self.current_step += 1
        done = self.current_step >= len(self.df) - 1

        return self._next_observation(), reward, done, {}

# Function to build the LSTM model with added dropout layers
def build_lstm_model(window_size, action_shape):
    model = tf.keras.Sequential()
    model.add(layers.LSTM(256, return_sequences=True, input_shape=(window_size, 6)))
    model.add(layers.LSTM(128, return_sequences=False))
    model.add(layers.BatchNormalization())
    model.add(layers.Dense(96, activation='relu'))
    model.add(layers.Dense(48, activation='relu'))
    model.add(layers.Dense(24, activation='relu'))
    model.add(layers.Dense(action_shape, activation='linear'))
    model.compile(loss='mse', optimizer=tf.keras.optimizers.Adam(learning_rate=0.0001))
    return model

# Function to save training parameters like epsilon
def save_training_parameters(epsilon, filename='training_parameters.csv'):
    params = pd.DataFrame({
        'epsilon': [epsilon]
    })
    params.to_csv(filename, index=False)

# Function to load training parameters like epsilon
def load_training_parameters(filename='training_parameters.csv'):
    if os.path.exists(filename):
        params = pd.read_csv(filename)
        if 'epsilon' in params.columns:
            return params['epsilon'][0]
    return 1.0  # Default value for epsilon if no saved file exists

# Save the model
def save_model(model, path='trading_model.keras'):
    model.save(path)  # Save in the new Keras format

# Load the model
def load_model(path='trading_model.keras'):
    return tf.keras.models.load_model(path)

# Save the status (balance, share position)
def save_status(balance, num_shares, step, filename='trading_status.csv'):
    status = pd.DataFrame({
        'balance': [balance],
        'num_shares': [num_shares],
        'step': [step]
    })
    status.to_csv(filename, index=False)

# Load the status
def load_status(filename='trading_status.csv'):
    if os.path.exists(filename):
        status = pd.read_csv(filename)
        return status['balance'][0], status['num_shares'][0], status['step'][0]
    else:
        return 10000, 0, 0  # Default starting values

# Load historical data
def load_data(csv_file):
    df = pd.read_csv(csv_file)
    df['timestamp'] = pd.to_datetime(df['timestamp'])
    df.set_index('timestamp', inplace=True)  # Set 'timestamp' as index

    df = df[['Open', 'High', 'Low', 'Close', 'Volume']]  # Ignore 'Adj Close'
    return df

# New function to pick a random file from a folder
def pick_random_file(folder):
    files = os.listdir(folder)  # Get all files in the folder
    csv_files = [f for f in files if f.endswith('.csv')]  # Filter for CSV files
    if not csv_files:
        raise Exception("No CSV files found in the folder.")
    return os.path.join(folder, random.choice(csv_files))  # Choose a random file and return its path

# Constants
BATCH_SIZE = 1000000000  # Batch size for training
GAMMA = 0.95  # Discount factor for future rewards

# Define the training step function outside of the main loop
def train_step(model, states_array, q_values):
    return model.train_on_batch(states_array, q_values)

def main():
    print("Starting the training process...", flush=True)
    checkpoint_filepath = 'best_model.weights.h5'
    model_checkpoint_callback = ModelCheckpoint(
        filepath=checkpoint_filepath,
        save_weights_only=True,
        monitor='loss',
        mode='min',
        save_best_only=True
    )

    episode_count = 0
    csv_change_frequency = 100  # Change CSV every 100 episodes
    folder_path = 'DATA'
    
    # Load the first CSV file
    current_csv = pick_random_file(folder_path)
    df = load_data(current_csv)
    env = TradingEnv(df)

    # Log the first CSV file being used
    logging.info(f"Using CSV: {current_csv} for the first 100 episodes.")

    # Load previous training parameters if they exist
    epsilon = load_training_parameters()
    
    # Load or initialize model
    model = None
    if os.path.exists('trading_model.keras'):
        model = load_model('trading_model.keras')
        print("Loaded existing model.", flush=True)
    else:
        model = build_lstm_model(env.window_size, env.action_space.n)
        print("Created new model.", flush=True)

    # Initialize replay memory
    replay_memory = []

    print(f"Started Training", flush=True)

    # Training loop
    for episode in range(10000):  # Number of episodes for training
        episode_count += 1

        # Switch CSV file every 100 episodes
        if episode_count % csv_change_frequency == 0:
            current_csv = pick_random_file(folder_path)  # Pick a new CSV
            df = load_data(current_csv)
            env = TradingEnv(df)
            logging.info(f"Switched to new CSV: {current_csv} at episode {episode + 1}")

        print(f"Starting episode {episode + 1}...", flush=True)
        state = env.reset()
        done = False
        total_reward = 0
        step_count = 0  # Initialize step counter

        while not done:
            step_count += 1  # Increment step counter
            # Epsilon-greedy action selection
            if np.random.rand() < epsilon:
                action = env.action_space.sample()  # Explore
            else:
                q_values = model.predict(state.reshape(1, env.window_size, 6), verbose=0)
                action = np.argmax(q_values[0])  # Exploit

            next_state, reward, done, _ = env.step(action)
            replay_memory.append((state, action, reward, next_state, done))

            if len(replay_memory) > 128:
                replay_memory.pop(0)  # Maintain the replay memory size

            # Update the state
            state = next_state
            total_reward += reward

            # Training the model with samples from the replay memory
            if len(replay_memory) >= BATCH_SIZE:
                batch = random.sample(replay_memory, BATCH_SIZE)
                states_array = np.array([item[0] for item in batch])
                actions_array = np.array([item[1] for item in batch])
                rewards_array = np.array([item[2] for item in batch])
                next_states_array = np.array([item[3] for item in batch])
                dones_array = np.array([item[4] for item in batch])

                # Q-value calculation
                q_values_next = model.predict(next_states_array)
                q_values_target = rewards_array + GAMMA * np.max(q_values_next, axis=1) * (1 - dones_array)

                # Update Q-values
                q_values = model.predict(states_array)
                q_values[np.arange(BATCH_SIZE), actions_array] = q_values_target

                # Train the model
                train_step(model, states_array, q_values)  # Call the training step

        # Update epsilon value after each episode
        epsilon = max(0.01, epsilon * 0.9999)

        # Save model and status
        save_model(model)
        save_training_parameters(epsilon)
        save_status(env.balance, env.num_shares, episode)

        # Print progress
        logging.info(f"Episode {episode + 1} finished in {step_count} steps with total reward: {total_reward}, Epsilon: {epsilon}, Using CSV: {current_csv}")
        gc.collect()

# Run the main function
if __name__ == "__main__":
    main()

The CSV is formatted like this:

timestamp,Open,High,Low,Close,Adj Close,Volume
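
And a quick way to sanity-check that a file in the DATA folder parses the way load_data expects (a rough sketch; 'DATA/example.csv' is just a placeholder path, not a real file of mine):

import pandas as pd

df = pd.read_csv('DATA/example.csv')                  # placeholder path: any CSV from the DATA folder
df['timestamp'] = pd.to_datetime(df['timestamp'])     # same parsing as load_data
df.set_index('timestamp', inplace=True)
df = df[['Open', 'High', 'Low', 'Close', 'Volume']]   # 'Adj Close' is dropped, as in load_data
print(df.dtypes)
print(df.head())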