Spaces:
Sleeping
Sleeping
| from process_xlsx import process_xlsx | |
| import pandas as pd | |
| import re | |
| # Step 1: Convert to DataFrame | |
| def create_course_dataframe(cleaned_column_names, column_names, department_program_courses): | |
| data = [] | |
| for department, programs in department_program_courses.items(): | |
| for program, courses in programs.items(): | |
| for course in courses: | |
| row_data = course[:] # Copy original row | |
| row_data.append(department) # Add Department column | |
| row_data.append(program if program else "N/A") # Add Program column | |
| data.append(row_data) | |
| # Add Department and Program to the column names | |
| extended_column_names = column_names + ['Department', 'Program'] | |
| # Create DataFrame | |
| df = pd.DataFrame(data, columns=extended_column_names) | |
| # Strip trailing spaces from column names and remove spaces/newlines | |
| df.columns = df.columns.str.strip().str.replace(' ', '').str.replace('\n', '') | |
| # Ensure all column names are strings before stripping | |
| df.rename(columns=lambda x: str(x).strip(), inplace=True) | |
| # Clean the cleaned_column_names to match the stripped column names | |
| cleaned_column_names = [col.strip() for col in cleaned_column_names] | |
| # Select columns based on cleaned_column_names | |
| df = df[cleaned_column_names] | |
| return df | |
| # Step 2: Diagnose Inconsistencies in Data | |
| def diagnose_inconsistencies(df): | |
| # Report missing values | |
| missing_values = df.isnull().sum() | |
| print("\nMissing Values Per Column:") | |
| print(missing_values[missing_values > 0]) | |
| # Check unique value counts to spot potential inconsistencies | |
| print("\nUnique Value Counts Per Column:") | |
| for column in df.columns: | |
| unique_vals = df[column].nunique() | |
| print(f"{column}: {unique_vals} unique values") | |
| # Identify potential misspellings and inconsistent values in key columns | |
| # Example: Checking for inconsistencies in 'Course Code', 'Instructor', 'Room', etc. | |
| print("\nInconsistent Patterns and Values:") | |
| # Pattern checks for Course Code (e.g., expecting format like 'MAT101', 'STA421/521') | |
| inconsistent_course_codes = df[~df['CourseCode'].str.match(r'^[A-Z]{3}\d{3}(/\d{3})?$')] | |
| if not inconsistent_course_codes.empty: | |
| print("\nInconsistent Course Codes:") | |
| print(inconsistent_course_codes[['CourseCode']].drop_duplicates()) | |
| # Check for inconsistent capitalization in 'Instructor' column | |
| df['Instructor'] = df['Instructor'].str.strip().str.title() | |
| instructor_inconsistencies = df['Instructor'].value_counts() | |
| print("\nInstructor Inconsistencies:") | |
| print(instructor_inconsistencies[instructor_inconsistencies > 1]) | |
| # Check for possible misspellings or variations in Room | |
| print("\nRoom Variations:") | |
| room_variations = df['Room'].value_counts() | |
| print(room_variations[room_variations > 1]) | |
| # Identify rows with missing key fields that should generally be non-null | |
| key_columns = ['CourseCode', 'CourseTitle', 'Cr', 'Instructor'] | |
| missing_key_fields = df[df[key_columns].isnull().any(axis=1)] | |
| if not missing_key_fields.empty: | |
| print("\nRows with Missing Key Fields:") | |
| print(missing_key_fields[key_columns]) | |
| # Display data types and any anomalies in numeric fields | |
| print("\nData Types and Anomalies in Numeric Fields:") | |
| for column in df.select_dtypes(include=['number']).columns: | |
| print(f"{column} - Min: {df[column].min()}, Max: {df[column].max()}, Unique Values: {df[column].nunique()}") | |
| return df | |
| file_path = "data/FTCM_Course_List_Spring2025.xlsx" | |
| result = process_xlsx(file_path) | |
| if result: | |
| column_names, department_program_courses = result | |
| print(f"Column Names:{column_names}") | |
| else: | |
| print(f"Error processing file. {file_path}") | |
| cleaned_column_names = ['CourseCode', 'CourseTitle', 'Cr', 'Prereq(s)', | |
| 'Instructor', 'Major/GE/Elective', 'Format', 'Mon', 'MonTo', | |
| 'Tue', 'TueTo', 'Wed', 'WedTo', 'Thu', 'ThuTo', | |
| 'Fri', 'FriTo', 'Sat', 'SatTo', 'Platform', 'New/Repeat', 'Room', 'Department', 'Program'] | |
| # Sample usage | |
| # Assuming column_names and department_program_courses are already defined | |
| df = create_course_dataframe(cleaned_column_names, column_names, department_program_courses) | |
| df_cleaned = diagnose_inconsistencies(df) | |
| diagnose_inconsistencies(df_cleaned) |