Upload 17 files

7f90ea0 verified 3 months ago

14.2 kB

	"""
	Demand Prediction System - Prediction Script

	This script loads a trained model and makes demand predictions for products
	on future dates. Supports both ML models and time-series models (ARIMA, Prophet).

	Usage (ML Models):
	python predict.py --product_id 1 --date 2024-01-15 --price 100 --discount 10 --category Electronics

	Usage (Time-Series Models - overall demand):
	python predict.py --date 2024-01-15 --model_type timeseries
	"""

	import pandas as pd
	import numpy as np
	import joblib
	import json
	import argparse
	from datetime import datetime
	import os
	import warnings
	warnings.filterwarnings('ignore')

	# Configuration: on-disk locations of the model artifacts this script reads.
	# Every path lives under MODEL_DIR and uses a forward slash as separator.
	MODEL_DIR = 'models'
	MODEL_PATH = MODEL_DIR + '/best_model.joblib'
	TS_MODEL_PATH = MODEL_DIR + '/best_timeseries_model.joblib'
	PREPROCESSING_PATH = MODEL_DIR + '/preprocessing.joblib'
	METADATA_PATH = MODEL_DIR + '/model_metadata.json'
	ALL_MODELS_METADATA_PATH = MODEL_DIR + '/all_models_metadata.json'


	def _format_r2(value):
	    """Render an R2 value with four decimals, or verbatim when it is not numeric."""
	    try:
	        return f"{value:.4f}"
	    except (TypeError, ValueError):
	        # e.g. the 'N/A' placeholder used when the metric is missing.
	        return str(value)


	def load_model_and_preprocessing(model_type='auto'):
	    """
	    Load the trained model and preprocessing objects.

	    Args:
	        model_type: 'ml', 'timeseries', or 'auto'. With 'auto' the
	            time-series model is chosen when the 'best_model' entry of
	            ALL_MODELS_METADATA_PATH is 'ARIMA' or 'Prophet'; otherwise the
	            ML model is chosen. Any value other than 'timeseries'/'auto'
	            selects the ML branch.

	    Returns:
	        tuple: (model, preprocessing_data, model_name, is_timeseries).
	        preprocessing_data is None for time-series models.

	    Raises:
	        FileNotFoundError: If a required model or preprocessing file is missing.
	    """
	    # Load metadata to determine best model. Start from empty defaults so the
	    # later lookups are safe when the metadata file does not exist.
	    all_metadata = {}
	    best_model_name = None
	    if os.path.exists(ALL_MODELS_METADATA_PATH):
	        with open(ALL_MODELS_METADATA_PATH, 'r') as f:
	            all_metadata = json.load(f)
	        best_model_name = all_metadata.get('best_model', 'Unknown')

	    # Determine which model to use
	    if model_type == 'auto':
	        if best_model_name in ('ARIMA', 'Prophet'):
	            model_type = 'timeseries'
	        else:
	            model_type = 'ml'

	    if model_type == 'timeseries':
	        if not os.path.exists(TS_MODEL_PATH):
	            raise FileNotFoundError(
	                f"Time-series model not found at {TS_MODEL_PATH}. Please run train_model.py first."
	            )

	        print("Loading time-series model...")
	        # SECURITY: joblib.load unpickles arbitrary objects; only load
	        # artifacts from a trusted source.
	        model = joblib.load(TS_MODEL_PATH)

	        # NOTE(review): when 'timeseries' is forced while the recorded best
	        # model is an ML model, the returned name is that ML model's name,
	        # which predict_demand_timeseries does not recognise.
	        if best_model_name:
	            print(f"Model: {best_model_name}")
	            metrics = all_metadata.get('all_models', {}).get(best_model_name)
	            if metrics is not None:
	                # The metric may be absent; _format_r2 prints the 'N/A'
	                # placeholder instead of failing on the float format spec.
	                print(f"R2 Score: {_format_r2(metrics.get('r2', 'N/A'))}")

	        return model, None, best_model_name or 'Time-Series', True

	    # ML model: both the estimator and its preprocessing objects are required.
	    if not os.path.exists(MODEL_PATH):
	        raise FileNotFoundError(
	            f"ML model not found at {MODEL_PATH}. Please run train_model.py first."
	        )

	    if not os.path.exists(PREPROCESSING_PATH):
	        raise FileNotFoundError(
	            f"Preprocessing objects not found at {PREPROCESSING_PATH}. Please run train_model.py first."
	        )

	    print("Loading ML model and preprocessing objects...")
	    # SECURITY: see the note on joblib.load above.
	    model = joblib.load(MODEL_PATH)
	    preprocessing_data = joblib.load(PREPROCESSING_PATH)

	    # Load metadata if available
	    if os.path.exists(METADATA_PATH):
	        with open(METADATA_PATH, 'r') as f:
	            metadata = json.load(f)
	        model_name = metadata.get('model_name', 'ML Model')
	        print(f"Model: {model_name}")
	        r2_value = metadata.get('metrics', {}).get('r2', 'N/A')
	        print(f"R2 Score: {_format_r2(r2_value)}")
	    else:
	        model_name = best_model_name or 'ML Model'

	    return model, preprocessing_data, model_name, False


	def prepare_features(product_id, date, price, discount, category, preprocessing_data):
	    """
	    Prepare features for prediction using the same preprocessing pipeline.

	    Args:
	        product_id: Product ID
	        date: Date string (YYYY-MM-DD), datetime object, or pandas Timestamp
	        price: Product price
	        discount: Discount percentage (0-100)
	        category: Product category
	        preprocessing_data: Dictionary with the keys 'encoders' (holding the
	            'category' and 'product_id' encoders), 'feature_names' and 'scaler'

	    Returns:
	        The output of scaler.transform applied to a single-row feature array
	        whose columns follow preprocessing_data['feature_names'].

	    Raises:
	        KeyError: If 'feature_names' lists a feature this function does not build.
	    """
	    # Always normalise to a pandas Timestamp: a plain datetime.datetime has no
	    # .quarter attribute, so converting only strings would fail for it.
	    date = pd.to_datetime(date)

	    # Extract date features (same as in training)
	    day = date.day
	    month = date.month
	    day_of_week = date.weekday()  # 0=Monday, 6=Sunday
	    weekend = 1 if day_of_week >= 5 else 0
	    year = date.year
	    quarter = date.quarter

	    # Encode categorical variables
	    category_encoder = preprocessing_data['encoders']['category']
	    product_encoder = preprocessing_data['encoders']['product_id']

	    # Handle unseen categories/products
	    try:
	        category_encoded = category_encoder.transform([category])[0]
	    except ValueError:
	        # Category not seen during training: fall back to code 0.
	        # NOTE(review): presumably the code of the encoder's first class, not
	        # the most frequent category - confirm against the training script.
	        print(f"Warning: Category '{category}' not seen during training. Using default encoding.")
	        category_encoded = 0

	    try:
	        product_id_encoded = product_encoder.transform([product_id])[0]
	    except ValueError:
	        # Product not seen during training: reuse the encoding of the first
	        # class known to the encoder.
	        print(f"Warning: Product ID '{product_id}' not seen during training. Using default encoding.")
	        product_id_encoded = product_encoder.transform([product_encoder.classes_[0]])[0]

	    # Create feature dictionary
	    feature_dict = {
	        'price': price,
	        'discount': discount,
	        'day': day,
	        'month': month,
	        'day_of_week': day_of_week,
	        'weekend': weekend,
	        'year': year,
	        'quarter': quarter,
	        'category_encoded': category_encoded,
	        'product_id_encoded': product_id_encoded,
	    }

	    # Create feature array in the same order as training
	    feature_names = preprocessing_data['feature_names']
	    features = np.array([[feature_dict[name] for name in feature_names]])

	    # Scale features
	    scaler = preprocessing_data['scaler']
	    return scaler.transform(features)


	def predict_demand_ml(product_id, date, price, discount, category, model, preprocessing_data):
	    """
	    Predict the demand for one product on one date with the ML model.

	    Args:
	        product_id: Product ID
	        date: Date string (YYYY-MM-DD) or datetime object
	        price: Product price
	        discount: Discount percentage (0-100)
	        category: Product category
	        model: Trained ML model exposing predict()
	        preprocessing_data: Dictionary containing encoders and scaler

	    Returns:
	        The first value returned by model.predict, floored at 0.
	    """
	    # Build the single-row model input with the shared preprocessing step.
	    model_input = prepare_features(
	        product_id, date, price, discount, category, preprocessing_data
	    )

	    raw_prediction = model.predict(model_input)[0]

	    # Demand cannot be negative, so floor the model output at zero.
	    return max(0, raw_prediction)


	def predict_demand_timeseries(date, model, model_name):
	    """
	    Predict overall daily demand using time-series model.

	    Args:
	        date: Date string (YYYY-MM-DD) or datetime object. Only the Prophet
	            branch uses it; the ARIMA branch always forecasts one step ahead.
	        model: Trained time-series model (ARIMA or Prophet)
	        model_name: Name of the model ('ARIMA' or 'Prophet')

	    Returns:
	        float: Predicted total daily sales quantity (never negative), or None
	        when the model name is unknown or the prediction fails.
	    """
	    # Convert date to datetime if string
	    if isinstance(date, str):
	        date = pd.to_datetime(date)

	    if model_name == 'ARIMA':
	        # Simplified approach: the number of steps between the end of the
	        # training data and `date` is not known here, so forecast a single
	        # step ahead regardless of `date`.
	        try:
	            forecast = model.forecast(steps=1)
	            # Take the first value by position. Indexing a date-indexed pandas
	            # Series with [0] is a label lookup, so go through an array; this
	            # also covers a bare scalar result.
	            prediction = np.asarray(forecast).ravel()[0]
	            return float(max(0, prediction))
	        except Exception as e:
	            # Deliberate best-effort: report the problem and signal failure.
	            print(f"Error in ARIMA prediction: {e}")
	            return None

	    elif model_name == 'Prophet':
	        # For Prophet, create a future dataframe holding the requested date.
	        try:
	            future = pd.DataFrame({'ds': [date]})
	            forecast = model.predict(future)
	            prediction = forecast['yhat'].iloc[0]
	            return float(max(0, prediction))
	        except Exception as e:
	            # Deliberate best-effort: report the problem and signal failure.
	            print(f"Error in Prophet prediction: {e}")
	            return None

	    else:
	        print(f"Unknown time-series model: {model_name}")
	        return None


	def predict_batch(predictions_data, model, preprocessing_data):
	    """
	    Predict demand for multiple products/dates at once.

	    Args:
	        predictions_data: Iterable of dictionaries, each containing:
	            - product_id
	            - date
	            - price
	            - discount
	            - category
	        model: Trained ML model
	        preprocessing_data: Dictionary containing encoders and scaler

	    Returns:
	        list: Predicted sales quantities, in the same order as the input.

	    Raises:
	        KeyError: If an entry lacks one of the required keys.
	    """
	    # Delegate to predict_demand_ml, the per-item ML predictor defined in this
	    # module (the name `predict_demand` does not exist here).
	    return [
	        predict_demand_ml(
	            item['product_id'],
	            item['date'],
	            item['price'],
	            item['discount'],
	            item['category'],
	            model,
	            preprocessing_data,
	        )
	        for item in predictions_data
	    ]


	def main():
	    """
	    Command-line entry point: parse arguments, load a model, print a prediction.

	    Problems (bad date, missing model files, missing ML arguments, failed
	    prediction) are reported on stdout and end the function early.
	    """
	    cli = argparse.ArgumentParser(
	        description='Predict product demand for a given date and product details',
	        formatter_class=argparse.RawDescriptionHelpFormatter,
	        epilog="""
	Examples (ML Models):
	python predict.py --product_id 1 --date 2024-01-15 --price 100 --discount 10 --category Electronics
	python predict.py --product_id 5 --date 2024-06-20 --price 50 --discount 0 --category Clothing

	Examples (Time-Series Models - overall daily demand):
	python predict.py --date 2024-01-15 --model_type timeseries
	"""
	    )

	    cli.add_argument(
	        '--product_id',
	        type=int,
	        default=None,
	        help='Product ID (required for ML models)',
	    )
	    cli.add_argument(
	        '--date',
	        type=str,
	        required=True,
	        help='Date in YYYY-MM-DD format',
	    )
	    cli.add_argument(
	        '--price',
	        type=float,
	        default=None,
	        help='Product price (required for ML models)',
	    )
	    cli.add_argument(
	        '--discount',
	        type=float,
	        default=0,
	        help='Discount percentage (0-100), default: 0 (for ML models)',
	    )
	    cli.add_argument(
	        '--category',
	        type=str,
	        default=None,
	        help='Product category (required for ML models)',
	    )
	    cli.add_argument(
	        '--model_type',
	        type=str,
	        default='auto',
	        choices=['auto', 'ml', 'timeseries'],
	        help='Model type to use: auto (best model), ml, or timeseries',
	    )

	    args = cli.parse_args()

	    # Reject unparseable dates before any model file is touched.
	    try:
	        parsed_date = pd.to_datetime(args.date)
	    except ValueError:
	        print(f"Error: Invalid date format '{args.date}'. Please use YYYY-MM-DD format.")
	        return

	    # Load the model; a missing artifact is reported rather than raised.
	    try:
	        loaded = load_model_and_preprocessing(args.model_type)
	    except FileNotFoundError as err:
	        print(f"Error: {err}")
	        return
	    model, preprocessing_data, model_name, is_timeseries = loaded

	    # Product details are only mandatory for the ML path.
	    if not is_timeseries:
	        if args.product_id is None or args.price is None or args.category is None:
	            print("Error: ML models require --product_id, --price, and --category arguments.")
	            return

	        # Out-of-range discounts are clamped to 0-100, not rejected.
	        if args.discount > 100 or args.discount < 0:
	            print(f"Warning: Discount {args.discount}% is outside 0-100 range. Clamping to valid range.")
	            args.discount = max(0, min(100, args.discount))

	    heavy_rule = "=" * 60
	    light_rule = "-" * 60
	    model_kind = 'Time-Series' if is_timeseries else 'Machine Learning'

	    # Summary header
	    print("\n" + heavy_rule)
	    print("MAKING PREDICTION")
	    print(heavy_rule)
	    print(f"Model: {model_name}")
	    print(f"Model Type: {model_kind}")
	    print(f"Date: {args.date}")

	    if is_timeseries:
	        print(light_rule)

	        predicted_demand = predict_demand_timeseries(args.date, model, model_name)
	        if predicted_demand is None:
	            print("Error: Failed to make prediction.")
	            return

	        print(f"\nPredicted Total Daily Sales Quantity: {predicted_demand:.0f} units")
	        print("(This is the predicted total demand across all products for this date)")
	    else:
	        print(f"Product ID: {args.product_id}")
	        print(f"Price: ${args.price:.2f}")
	        print(f"Discount: {args.discount}%")
	        print(f"Category: {args.category}")
	        print(light_rule)

	        predicted_demand = predict_demand_ml(
	            args.product_id,
	            args.date,
	            args.price,
	            args.discount,
	            args.category,
	            model,
	            preprocessing_data,
	        )

	        print(f"\nPredicted Sales Quantity: {predicted_demand:.0f} units")
	        print("(This is the predicted demand for this specific product)")

	    print(heavy_rule)

	    # Calendar details of the requested date, from the date parsed above.
	    weekend_flag = "Yes" if parsed_date.weekday() >= 5 else "No"

	    print(f"\nDate Information:")
	    print(f" Day of week: {parsed_date.strftime('%A')}")
	    print(f" Weekend: {weekend_flag}")
	    print(f" Month: {parsed_date.strftime('%B')}")
	    print(f" Quarter: Q{parsed_date.quarter}")


	# Run the CLI only when executed as a script, not when imported.
	if __name__ == "__main__":
	    main()