Step 7: Model Testing & Optimization
After developing our resume scoring system, we need to evaluate its performance and make adjustments to improve its accuracy. This step is crucial to ensure our Resume Parser AI works effectively in real-world scenarios.
Why Testing and Optimization Matter
Even the most carefully designed models need testing and refinement. Testing helps us:
- Identify and fix bugs or errors
- Ensure the model works as expected
- Measure the model's performance
- Find opportunities for improvement
Preparing Test Data
To properly test our model, we need a separate test dataset that wasn't used during development:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Make sure the output directories used in this step exist
for folder in ('data', 'results', 'models'):
    os.makedirs(folder, exist_ok=True)

# Split our data into training and testing sets
train_data, test_data = train_test_split(resume_data, test_size=0.2, random_state=42)
print(f"Training data size: {len(train_data)}")
print(f"Testing data size: {len(test_data)}")

# Save the test data for future reference
test_data.to_csv('data/test_data.csv', index=False)
Evaluating Our Scoring System
Since we don't have "ground truth" labels for the ideal scores, we'll need to evaluate our system in different ways:
1. Consistency Check
First, let's check if our scoring system is consistent:
import matplotlib.pyplot as plt
import seaborn as sns
# Choose a job title to evaluate
job_title = 'Data Scientist'
score_column = f'score_{job_title.replace(" ", "_")}'
# Calculate scores for test data
requirements = job_requirements[job_title]
test_data[score_column] = test_data.apply(
    lambda row: score_resume(
        row,
        job_title,
        requirements['required_skills'],
        requirements['preferred_skills'],
        requirements['min_experience'],
        requirements['education_level']
    ),
    axis=1
)
# Check correlation between score and key features
correlation_features = ['years_experience']
for skill in requirements['required_skills']:
    test_data[f'has_{skill}'] = test_data['skills'].apply(lambda x: 1 if skill in x else 0)
    correlation_features.append(f'has_{skill}')
correlation_data = test_data[correlation_features + [score_column]]
correlation_matrix = correlation_data.corr()
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm')
plt.title(f'Correlation between Features and {job_title} Score')
plt.tight_layout()
plt.savefig('results/score_correlation.png')
plt.close()
# Check if scores align with our expectations
print("\nTop 5 candidates by score:")
top_candidates = test_data.sort_values(by=score_column, ascending=False).head(5)
for idx, row in top_candidates.iterrows():
    print(f"Candidate {idx}: Score {row[score_column]}")
    print(f" Experience: {row['years_experience']} years")
    print(f" Education: {', '.join(row['education'])}")
    print(f" Skills: {', '.join(row['skills'])}")
    print(f" Required skills match: {sum(1 for skill in requirements['required_skills'] if skill in row['skills'])}/{len(requirements['required_skills'])}")
    print()
print("\nBottom 5 candidates by score:")
bottom_candidates = test_data.sort_values(by=score_column).head(5)
for idx, row in bottom_candidates.iterrows():
    print(f"Candidate {idx}: Score {row[score_column]}")
    print(f" Experience: {row['years_experience']} years")
    print(f" Education: {', '.join(row['education'])}")
    print(f" Skills: {', '.join(row['skills'])}")
    print(f" Required skills match: {sum(1 for skill in requirements['required_skills'] if skill in row['skills'])}/{len(requirements['required_skills'])}")
    print()
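A quick sanity test rounds out the consistency check. The sketch below assumes score_resume accepts any row-like object with the skills, years_experience, and education fields used in earlier steps, and that the requirement fields are lists; the two synthetic candidates (and the "Master's Degree" education value) are hypothetical:
# Sanity check: a candidate who meets every requirement should outscore
# one who meets none (synthetic rows, hypothetical field values)
strong = pd.Series({
    'skills': requirements['required_skills'] + requirements['preferred_skills'],
    'years_experience': requirements['min_experience'] + 3,
    'education': ["Master's Degree"]
})
weak = pd.Series({
    'skills': [],
    'years_experience': 0,
    'education': []
})

def score_candidate(row):
    return score_resume(
        row,
        job_title,
        requirements['required_skills'],
        requirements['preferred_skills'],
        requirements['min_experience'],
        requirements['education_level']
    )

assert score_candidate(strong) > score_candidate(weak), "Scoring failed a basic sanity check"
print(f"Sanity check passed: {score_candidate(strong):.2f} > {score_candidate(weak):.2f}")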
2. Manual Validation
For a small sample, we can manually review the scores to see if they make sense:
# Select a random sample of 5 resumes
sample_indices = np.random.choice(test_data.index, size=5, replace=False)
sample_resumes = test_data.loc[sample_indices]
print("\nManual validation sample:")
for idx, row in sample_resumes.iterrows():
    print(f"Candidate {idx}: Score {row[score_column]}")
    print(f" Experience: {row['years_experience']} years")
    print(f" Education: {', '.join(row['education'])}")
    print(f" Skills: {', '.join(row['skills'])}")
    print(f" Required skills match: {sum(1 for skill in requirements['required_skills'] if skill in row['skills'])}/{len(requirements['required_skills'])}")
    # Print a snippet of the resume for manual review
    print("\nResume snippet:")
    print(row['Resume'][:300] + "...\n")
    print("-" * 80)
3. Cross-Checking with Different Scoring Methods
Let's compare our main scoring method with the weighted scoring method under each of our weight profiles and check how closely their rankings agree:
# Apply weighted scoring to test data
for profile_name, weights in weight_profiles.items():
    column_name = f'score_{profile_name}'
    test_data[column_name] = test_data.apply(
        lambda row: weighted_score_resume(row, job_title, requirements, weights),
        axis=1
    )
# Compare the rankings
print("\nRank correlation between different scoring methods:")
score_columns = [score_column] + [f'score_{profile}' for profile in weight_profiles.keys()]
rank_correlation = test_data[score_columns].corr(method='spearman')
print(rank_correlation)
# Visualize the comparison
plt.figure(figsize=(10, 8))
sns.heatmap(rank_correlation, annot=True, cmap='viridis')
plt.title('Rank Correlation between Different Scoring Methods')
plt.tight_layout()
plt.savefig('results/scoring_method_comparison.png')
plt.close()
Optimizing Our Model
Based on our evaluation, we can make adjustments to improve our scoring system:
1. Tuning the Weights
Let's try to find the optimal weights for our scoring system:
from itertools import product
# Define a range of weights to try
weight_options = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6]
# Generate weight combinations that sum to 1
valid_weight_combinations = []
for weights in product(weight_options, repeat=5):
    # Compare with a tolerance: sums of floats like 0.1 and 0.2 rarely equal 1.0 exactly
    if abs(sum(weights) - 1.0) < 1e-9:
        valid_weight_combinations.append({
            'skills': weights[0],
            'experience': weights[1],
            'education': weights[2],
            'relevance': weights[3],
            'keywords': weights[4]
        })
print(f"Testing {len(valid_weight_combinations)} weight combinations")
# Function to evaluate a weight combination
def evaluate_weights(weights, data, job_title, requirements):
    # Calculate scores using these weights
    scores = data.apply(
        lambda row: weighted_score_resume(row, job_title, requirements, weights),
        axis=1
    )
    # Calculate the metric: for this example, the correlation between the
    # scores and the fraction of required skills each resume matches
    req_skill_match = data['skills'].apply(
        lambda skills: sum(1 for skill in requirements['required_skills'] if skill in skills) / len(requirements['required_skills'])
    )
    correlation = scores.corr(req_skill_match)
    return correlation
# Test a subset of weight combinations (for efficiency)
test_combinations = valid_weight_combinations[:10] # In practice, test more
results = []
for weights in test_combinations:
    correlation = evaluate_weights(weights, test_data, job_title, requirements)
    results.append({
        'weights': weights,
        'correlation': correlation
    })
# Find the best weights
results.sort(key=lambda x: x['correlation'], reverse=True)
best_weights = results[0]['weights']
print("\nBest weights found:")
for key, value in best_weights.items():
    print(f"{key}: {value}")
print(f"Correlation: {results[0]['correlation']}")
2. Feature Importance Analysis
Let's analyze which features contribute most to the scores:
import matplotlib.pyplot as plt
# Calculate feature importance for required skills
skill_importance = {}
for skill in requirements['required_skills']:
    # Create two groups: resumes with and without this skill
    with_skill = test_data[test_data['skills'].apply(lambda skills: skill in skills)]
    without_skill = test_data[test_data['skills'].apply(lambda skills: skill not in skills)]
    # Calculate average score difference
    avg_with = with_skill[score_column].mean()
    avg_without = without_skill[score_column].mean()
    importance = avg_with - avg_without
    skill_importance[skill] = importance
# Plot skill importance
plt.figure(figsize=(10, 6))
skills = list(skill_importance.keys())
importances = list(skill_importance.values())
sorted_indices = np.argsort(importances)
plt.barh([skills[i] for i in sorted_indices], [importances[i] for i in sorted_indices])
plt.xlabel('Score Difference')
plt.title('Skill Importance for Data Scientist Role')
plt.tight_layout()
plt.savefig('results/skill_importance.png')
plt.close()
print("\nSkill importance (average score difference):")
for skill, importance in sorted(skill_importance.items(), key=lambda x: x[1], reverse=True):
    print(f"{skill}: {importance:.2f}")
3. Adjusting the Algorithm
Based on our analysis, let's refine our scoring algorithm:
def optimized_score_resume(resume_row, job_title, requirements, best_weights):
    """
    Optimized version of our scoring function based on testing results
    """
    # Start with the weighted scoring function
    base_score = weighted_score_resume(resume_row, job_title, requirements, best_weights)
    # Add refinements based on our analysis, such as bonuses for
    # complementary skill combinations
    skills = resume_row['skills']
    if 'Python' in skills and 'Machine Learning' in skills and job_title == 'Data Scientist':
        base_score += 0.5  # Bonus for having this powerful combination
    # Adjusting for skill recency would require additional extraction
    # from the resume text
    # Cap the score at 10
    return min(10, base_score)
# Apply our optimized scoring
test_data['optimized_score'] = test_data.apply(
    lambda row: optimized_score_resume(row, job_title, requirements, best_weights),
    axis=1
)
# Compare with original score
plt.figure(figsize=(10, 6))
plt.scatter(test_data[score_column], test_data['optimized_score'], alpha=0.5)
plt.plot([0, 10], [0, 10], 'r--') # Diagonal line
plt.xlabel('Original Score')
plt.ylabel('Optimized Score')
plt.title('Comparison of Original and Optimized Scores')
plt.tight_layout()
plt.savefig('results/score_optimization.png')
plt.close()
# Check if top candidates changed
print("\nTop 5 candidates by optimized score:")
top_optimized = test_data.sort_values(by='optimized_score', ascending=False).head(5)
for idx, row in top_optimized.iterrows():
    print(f"Candidate {idx}: Original Score {row[score_column]}, Optimized Score {row['optimized_score']}")
    print(f" Skills: {', '.join(row['skills'])}")
Saving the Optimized Model
Let's save our optimized model for future use:
import pickle
# Create a dictionary with our optimized scoring function and parameters
optimized_model = {
    'score_resume': optimized_score_resume,
    'job_requirements': job_requirements,
    'best_weights': best_weights
}
# Save the model
with open('models/optimized_scoring_model.pkl', 'wb') as f:
    pickle.dump(optimized_model, f)
print("Optimized scoring model saved to models/optimized_scoring_model.pkl")
Continuous Improvement
In a real-world scenario, you would continue to refine your model based on:
- Feedback from recruiters who use the system
- Performance metrics tracking which candidates were actually hired (see the sketch after this list)
- A/B testing different scoring algorithms
- New data as more resumes are processed
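As an example of outcome-based tracking (the second bullet above), here is a sketch that assumes a hypothetical hired column, collected after deployment, with 1 marking candidates who were eventually hired:
from scipy.stats import spearmanr

def precision_at_k(scores, hired, k=10):
    # Fraction of the top-k scored candidates who were actually hired
    top_k = scores.sort_values(ascending=False).head(k).index
    return hired.loc[top_k].mean()

# 'hired' is hypothetical here; in production it would come from your
# applicant tracking system:
# corr, _ = spearmanr(test_data['optimized_score'], test_data['hired'])
# print(f"Correlation with hiring outcomes: {corr:.3f}")
# print(f"Precision@10: {precision_at_k(test_data['optimized_score'], test_data['hired']):.2f}")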
Model testing and optimization is an iterative process. As you gather more data and feedback, you can continuously improve your Resume Parser AI to make it more accurate and useful.
Next Steps
Now that we've tested and optimized our Resume Parser AI, we're ready for the final step: deploying our model as a simple web application that recruiters can use to upload and analyze resumes.