Step 5: Contextual Analysis

After clustering skills, we need to understand how words are used in context within resumes. This is where contextual analysis comes in, helping us extract more meaningful information from the text.

What is Contextual Analysis?

Contextual analysis involves understanding words based on their surrounding context rather than treating them as isolated terms. For resumes this is crucial: the same term can mean different things depending on where it appears. 'Java' in a skills section names a programming language, while 'Java House' in a work-history section names an employer.
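
To see why context matters, a minimal sketch helps (the show_context helper and the sample string below are illustrative only, not part of our pipeline). It prints a window of characters around each occurrence of a term so you can judge how it is being used:

import re

def show_context(text, term, window=40):
    # Print the surrounding characters for every occurrence of the term
    for match in re.finditer(re.escape(term), text, re.IGNORECASE):
        start = max(0, match.start() - window)
        end = min(len(text), match.end() + window)
        print(f"...{text[start:end]}...")

show_context(
    "Expert in Java development. Previously a barista at Java House Cafe.",
    "Java"
)

The same token "Java" shows up once as a skill and once as part of an employer's name; isolated keyword matching cannot tell these apart.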

Named Entity Recognition (NER)

Named Entity Recognition is a technique that identifies and classifies named entities in text into predefined categories such as person names, organizations, locations, time expressions, and quantities.

For our resume parser, we'll use spaCy, a powerful NLP library, to implement NER:


import spacy
import pandas as pd
from collections import Counter
from tqdm import tqdm

# Load spaCy model
nlp = spacy.load("en_core_web_sm")

# Define a function to extract entities
def extract_entities(text):
    doc = nlp(text)
    entities = {}
    
    # Extract standard entities
    for ent in doc.ents:
        if ent.label_ not in entities:
            entities[ent.label_] = []
        entities[ent.label_].append(ent.text)
    
    return entities

# Apply to our dataset (this may take some time)
entities_list = []
for resume in tqdm(resume_data['Resume']):
    # Process only the first 5000 characters to speed things up
    entities = extract_entities(resume[:5000])
    entities_list.append(entities)

# Add to our dataset
resume_data['entities'] = entities_list

# Let's see what types of entities we've extracted
entity_types = set()
for entities in entities_list:
    entity_types.update(entities.keys())

print("Entity types found:", entity_types)

# Count the most common organizations mentioned
all_orgs = []
for entities in entities_list:
    if 'ORG' in entities:
        all_orgs.extend(entities['ORG'])

org_counter = Counter(all_orgs)
print("\nTop 10 organizations mentioned:")
for org, count in org_counter.most_common(10):
    print(f"{org}: {count}")

Custom Entity Recognition for Resumes

The standard NER categories might not be sufficient for resumes. Let's create a custom entity recognizer for resume-specific information:


import re

# Function to extract years of experience
def extract_experience(text):
    # Pattern to match experience mentions
    patterns = [
        r'(\d+)\s+years?\s+(?:of\s+)?experience',
        r'experience\s+(?:of\s+)?(\d+)\s+years?',
        r'worked\s+(?:for\s+)?(\d+)\s+years?'
    ]
    
    years = []
    for pattern in patterns:
        matches = re.finditer(pattern, text.lower())
        for match in matches:
            years.append(int(match.group(1)))
    
    return max(years) if years else 0

# Function to extract education level
def extract_education(text):
    # Trailing spaces in short keywords like 'ms ' reduce false matches,
    # though plain substring checks remain approximate
    education_levels = {
        'phd': ['phd', 'ph.d', 'doctor of philosophy'],
        'masters': ['masters', 'master of', 'ms ', 'msc', 'm.sc', 'ma ', 'm.a'],
        'bachelors': ['bachelor', 'bs ', 'b.s', 'ba ', 'b.a', 'undergraduate'],
        'associate': ['associate', 'a.s', 'a.a'],
        'high school': ['high school', 'secondary school']
    }
    
    text_lower = text.lower()
    found_levels = []
    
    for level, keywords in education_levels.items():
        for keyword in keywords:
            if keyword in text_lower:
                found_levels.append(level)
                break
    
    return found_levels

# Function to extract skills
def extract_skills(text, skill_list):
    text_lower = text.lower()
    found_skills = []
    
    for skill in skill_list:
        if skill.lower() in text_lower:
            found_skills.append(skill)
    
    return found_skills

# Let's create a simple skill list (in a real application, this would be more comprehensive)
common_skills = [
    'Python', 'Java', 'JavaScript', 'C++', 'C#', 'SQL', 'HTML', 'CSS',
    'React', 'Angular', 'Vue', 'Node.js', 'Django', 'Flask',
    'Machine Learning', 'Data Analysis', 'Data Science', 'AI',
    'Project Management', 'Agile', 'Scrum', 'Kanban',
    'Microsoft Office', 'Excel', 'PowerPoint', 'Word',
    'Photoshop', 'Illustrator', 'InDesign', 'Figma',
    'Communication', 'Leadership', 'Teamwork', 'Problem Solving'
]

# Apply our custom extractors
resume_data['years_experience'] = resume_data['Resume'].apply(extract_experience)
resume_data['education'] = resume_data['Resume'].apply(extract_education)
resume_data['skills'] = resume_data['Resume'].apply(lambda x: extract_skills(x, common_skills))

# Display some statistics
print(f"\nAverage years of experience: {resume_data['years_experience'].mean():.2f}")
print("\nEducation level distribution:")
education_counts = {}
for edu_list in resume_data['education']:
    for edu in edu_list:
        education_counts[edu] = education_counts.get(edu, 0) + 1
for edu, count in sorted(education_counts.items(), key=lambda x: x[1], reverse=True):
    print(f"{edu}: {count}")

print("\nTop skills:")
skill_counts = {}
for skill_list in resume_data['skills']:
    for skill in skill_list:
        skill_counts[skill] = skill_counts.get(skill, 0) + 1
for skill, count in sorted(skill_counts.items(), key=lambda x: x[1], reverse=True)[:15]:
    print(f"{skill}: {count}")

Relationship Extraction

Beyond identifying entities, we can also extract relationships between them. For example, we might want to know which skills a candidate used at which company:


import os

import networkx as nx
import matplotlib.pyplot as plt

# Function to extract skill-company relationships
def extract_skill_company_relations(resume, skills, companies):
    relations = []
    
    # Simple heuristic: if the first occurrence of a skill and the first
    # occurrence of a company are within 100 characters of each other,
    # assume they're related
    resume_lower = resume.lower()
    
    for skill in skills:
        skill_lower = skill.lower()
        skill_pos = resume_lower.find(skill_lower)
        
        if skill_pos != -1:
            for company in companies:
                company_lower = company.lower()
                company_pos = resume_lower.find(company_lower)
                
                if company_pos != -1 and abs(skill_pos - company_pos) < 100:
                    relations.append((skill, company))
    
    return relations

# Apply to a sample of resumes
sample_size = min(50, len(resume_data))
sample_data = resume_data.sample(sample_size, random_state=42)

all_relations = []
for idx, row in sample_data.iterrows():
    skills = row['skills']
    companies = row['entities'].get('ORG', [])
    relations = extract_skill_company_relations(row['Resume'], skills, companies)
    all_relations.extend(relations)

# Create a graph of skill-company relationships
G = nx.Graph()
for skill, company in all_relations:
    G.add_edge(skill, company)

# Visualize the graph (if it's not too large)
if len(G) <= 30:  # Only visualize if the graph is manageable
    plt.figure(figsize=(12, 10))
    pos = nx.spring_layout(G, seed=42)
    
    # Draw nodes
    skill_nodes = [node for node in G.nodes() if node in common_skills]
    company_nodes = [node for node in G.nodes() if node not in common_skills]
    
    nx.draw_networkx_nodes(G, pos, nodelist=skill_nodes, node_color='blue', node_size=100, alpha=0.8)
    nx.draw_networkx_nodes(G, pos, nodelist=company_nodes, node_color='red', node_size=100, alpha=0.8)
    
    # Draw edges
    nx.draw_networkx_edges(G, pos, width=1.0, alpha=0.5)
    
    # Draw labels
    nx.draw_networkx_labels(G, pos, font_size=8)
    
    plt.title('Skill-Company Relationships')
    plt.axis('off')
    os.makedirs('results', exist_ok=True)  # make sure the output directory exists
    plt.savefig('results/skill_company_graph.png')
    plt.close()
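
Whether or not the graph is small enough to plot, it is worth inspecting the most frequent pairs directly:

# Inspect the most frequent skill-company pairs found in the sample
from collections import Counter

relation_counter = Counter(all_relations)
for (skill, company), count in relation_counter.most_common(10):
    print(f"{skill} <-> {company}: {count}")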

Contextual Skill Importance

Not all mentions of skills are equally important. Let's analyze the context to determine the importance of each skill:


def analyze_skill_importance(resume, skill):
    resume_lower = resume.lower()
    skill_lower = skill.lower()
    
    # Check if skill is in resume
    if skill_lower not in resume_lower:
        return 0
    
    # Base importance
    importance = 1
    
    # Check for indicators of expertise
    expertise_indicators = [
        'expert in', 'specialized in', 'proficient in',
        'advanced knowledge of', 'extensive experience with'
    ]
    
    for indicator in expertise_indicators:
        if f"{indicator} {skill_lower}" in resume_lower:
            importance += 2
    
    # Check for years of experience with this skill; escape the skill so
    # regex metacharacters in names like 'C++' don't break the pattern
    skill_escaped = re.escape(skill_lower)
    experience_patterns = [
        rf'(\d+)\s+years?\s+(?:of\s+)?experience\s+(?:with|in)\s+{skill_escaped}',
        rf'(\d+)\s+years?\s+{skill_escaped}\s+experience'
    ]
    
    for pattern in experience_patterns:
        matches = re.finditer(pattern, resume_lower)
        for match in matches:
            years = int(match.group(1))
            importance += min(years, 5)  # Cap at 5 years
    
    # Check if skill is mentioned in job title or summary (usually more important)
    lines = resume.split('\n')
    for i, line in enumerate(lines):
        if i < 5 and skill_lower in line.lower():  # First 5 lines (approximate header/summary)
            importance += 2
    
    return importance

# Apply to our dataset
for skill in common_skills[:5]:  # Analyze first 5 skills for demonstration
    resume_data[f'{skill}_importance'] = resume_data['Resume'].apply(
        lambda x: analyze_skill_importance(x, skill)
    )

# Display average importance for these skills
for skill in common_skills[:5]:
    avg_importance = resume_data[f'{skill}_importance'].mean()
    print(f"Average importance of {skill}: {avg_importance:.2f}")

Contextual analysis helps us understand not just what skills a candidate has, but how important those skills are to their experience and career. This provides a much richer understanding than simple keyword matching.

Next Steps

Now that we've performed contextual analysis on our resume data, we have a much richer understanding of the content. In the next step, we'll develop a scoring system that assigns each resume a weighted score based on its relevance to specific job titles.