#!/usr/bin/env python3
"""
Parse resume PDF and output structured JSON.
Usage: python parse_resume_pdf.py <pdf_path> [first_name] [last_name]
Output: JSON to stdout
"""
import sys
import json
import re

def extract_text_from_pdf(pdf_path):
    """Extract the text of every page of a PDF as one newline-joined string.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The extracted text of all pages joined with newlines, or None when
        pypdf cannot be imported or the file cannot be opened or parsed.
    """
    try:
        # Imported lazily so that a missing pypdf is reported through the
        # same None return as any other extraction failure.
        from pypdf import PdfReader
        reader = PdfReader(pdf_path)
        # Join pages with a newline: joining with '' would glue the last
        # line of one page to the first line of the next, and the resume
        # parser works line by line.
        return '\n'.join(page.extract_text() or '' for page in reader.pages)
    except Exception as exc:
        # Best-effort boundary: callers only check for None, but the cause
        # is reported on stderr (stdout carries the JSON output).
        print(f"PDF text extraction failed: {exc}", file=sys.stderr)
        return None

# Headers that terminate each section. Headers are located with a
# case-insensitive *substring* search, so e.g. 'WORK' also matches inside
# "Frameworks"; that heuristic is inherited and intentionally left as is.
_SKILLS_END_MARKERS = ('EXPERIENCE', 'EDUCATION', 'WORK', 'EMPLOYMENT')
_EXPERIENCE_END_MARKERS = ('EDUCATION', 'COURSEWORK', 'VOLUNTEER', 'CERTIFICATES', 'CERTIFICATIONS')
_EDUCATION_END_MARKERS = ('COURSEWORK', 'VOLUNTEER', 'CERTIFICATES', 'CERTIFICATIONS', 'EXPERIENCE', 'SKILLS')
_CERTIFICATE_END_MARKERS = ('INTERESTS', 'EDUCATION', 'SKILLS')

_MAX_SKILLS = 20          # skills kept after de-duplication
_MAX_LISTED_SKILLS = 15   # skills emitted in result['skills']
_MAX_TOP_SKILLS = 5       # skills emitted in result['top_skills']

_NAME_RE = re.compile(r'^[A-Z][a-z]+\s+[A-Z][a-z]+$')
_LOCATION_LINE_RE = re.compile(r'^[A-Za-z\s,]+$')
_SKILL_LINE_RE = re.compile(r'[A-Z][A-Z\s&]+[|:]\s*([^\n]+)')
_SKILL_SPLIT_RE = re.compile(r'[•\|,\s]+')
_TECH_WORD_RE = re.compile(
    r'\b(?:PHP|JavaScript|Python|ReactJS?|NodeJS?|HTML|CSS|MySQL|MongoDB|Firebase|Flutter|WordPress|Figma|Adobe|Bootstrap|Redux|Express|Agile|REST|MVC)\b',
    re.I,
)
_EXP_BLOCK_RE = re.compile(r'\n(?=[A-Z][a-zA-Z\s&]+(?:\s+\||\s+·))')
_TITLE_SPLIT_RE = re.compile(r'\s+[|·]\s+')
_DATE_RANGE_RE = re.compile(
    r'([A-Za-z]{3,9}\s+\d{4})\s*[–\-]\s*([A-Za-z]{3,9}\s+\d{4}|Present|[\w\s]+)'
)
_DURATION_RE = re.compile(r'·\s*(\d+\s*mos?)')
_DURATION_WORDS_RE = re.compile(r'\b\d+\s*(?:yrs?|years?|mos?|months?)\b', re.I)
_MONTH_YEAR_TOKEN_RE = re.compile(r'[A-Za-z]{3,9}\s+\d{4}\b')
_MONTH_YEAR_RE = re.compile(r'[A-Za-z]{3}\s+\d{4}')
_INTERN_RE = re.compile(r'\bintern(?:ship)?s?\b', re.I)
_EDU_BLOCK_RE = re.compile(r'\n(?=[A-Z][a-zA-Z\s&()]+(?:University|College|School|Institute))')
_DEGREE_PREFIX_RE = re.compile(r'^[A-Za-z\.\s,]+,')
_DEGREE_KEYWORD_RE = re.compile(r'(?<!\w)(?:B\.?Sc?|I\.?C\.?S|M\.?Sc?)(?!\w)|Bachelor|Master')
_GRADE_RE = re.compile(r'(?:CGPA|Grade|GPA)[:\s]*', re.I)
_CERT_TITLE_START_RE = re.compile(r'^[A-Z][a-z]')


def _get_section(content, start_marker, end_markers):
    """Return the text between start_marker and the nearest end marker.

    Markers are matched case-insensitively as substrings. The search runs on
    ``content`` itself (not on an upper-cased copy), so the returned slice
    stays aligned even when case conversion would change the string length.
    Returns '' when start_marker does not occur.
    """
    opening = re.compile(re.escape(start_marker), re.IGNORECASE).search(content)
    if opening is None:
        return ''
    start = opening.end()
    end = len(content)
    for marker in end_markers:
        closing = re.compile(re.escape(marker), re.IGNORECASE).search(content, start)
        if closing is not None:
            end = min(end, closing.start())
    return content[start:end].strip()


def _find_name_and_location(lines):
    """Find a "First Last" line in the last 15 lines and the line after it.

    Returns a (full_name, location) tuple; either element may be ''.
    """
    tail = lines[-15:]
    for pos, line in enumerate(tail):
        if not (_NAME_RE.match(line) and len(line) < 40):
            continue
        location = ''
        if pos + 1 < len(tail):
            following = tail[pos + 1]
            if ',' in following or _LOCATION_LINE_RE.match(following):
                location = following
        return line, location
    return '', ''


def _parse_skills(text):
    """Collect skill names from the SKILLS section plus known tech keywords.

    Returns at most _MAX_SKILLS names, de-duplicated case-insensitively with
    the first spelling kept. Returns [] when there is no SKILLS section.
    """
    section = _get_section(text, 'SKILLS', _SKILLS_END_MARKERS)
    if not section:
        return []
    candidates = []
    # Lines shaped like "CATEGORY| skill1, skill2" or "CATEGORY: a • b".
    for listing in _SKILL_LINE_RE.findall(section):
        for part in _SKILL_SPLIT_RE.split(listing):
            part = part.strip()
            if 1 < len(part) < 50:
                candidates.append(part)
    # Known technology names anywhere in the resume. The scan ignores case,
    # so the de-duplication below must ignore case as well.
    candidates.extend(_TECH_WORD_RE.findall(text))
    seen = set()
    unique = []
    for skill in candidates:
        key = skill.casefold()
        if key not in seen:
            seen.add(key)
            unique.append(skill)
    return unique[:_MAX_SKILLS]


def _extract_location(date_location):
    """Return what is left of a "dates, location" string once dates are removed.

    Works whether the location precedes or follows the dates.
    """
    remainder = _DATE_RANGE_RE.sub(' ', date_location, count=1)
    remainder = _DURATION_WORDS_RE.sub(' ', remainder)
    remainder = _MONTH_YEAR_TOKEN_RE.sub(' ', remainder)
    remainder = remainder.replace('·', ' ')
    return re.sub(r'\s+', ' ', remainder).strip(' ,|-–')


def _parse_experiences(text):
    """Parse the EXPERIENCE section into a list of experience dicts."""
    section = _get_section(text, 'EXPERIENCE', _EXPERIENCE_END_MARKERS)
    experiences = []
    # NOTE(review): a new block starts at every line that looks like
    # "Words |" or "Words ·". A "Job Title | dates" line made only of
    # letters and spaces therefore starts its own block and is separated
    # from its company line - confirm against real resumes.
    for block in _EXP_BLOCK_RE.split(section):
        block = block.strip()
        if len(block) < 20:
            continue
        block_lines = [ln.strip() for ln in block.split('\n') if ln.strip()]
        if len(block_lines) < 2:
            continue
        # First line: "Company | skills" or just "Company".
        company, _, skills_str = block_lines[0].partition('|')
        company = company.strip()
        skills_str = skills_str.strip()
        # Second line: "Job Title | dates, location" or just "Job Title".
        title_parts = _TITLE_SPLIT_RE.split(block_lines[1], maxsplit=1)
        job_title = title_parts[0].strip()
        date_location = title_parts[1].strip() if len(title_parts) > 1 else ''

        date_match = _DATE_RANGE_RE.search(date_location)
        duration_match = _DURATION_RE.search(date_location)
        # Whole-word test so "International" or "Internal" is not an internship.
        is_intern = bool(_INTERN_RE.search(job_title) or _INTERN_RE.search(company))
        experiences.append({
            'company_name': company[:255],
            'job_title': job_title[:255],
            'employment_type': 'Internship' if is_intern else 'Full-time',
            'start_date': date_match.group(1) if date_match else '',
            'end_date': date_match.group(2) if date_match else '',
            'duration': duration_match.group(1) if duration_match else '',
            'location': _extract_location(date_location)[:255],
            'skills': skills_str[:500],
            'description': '\n'.join(block_lines[2:])[:500],
        })
    return experiences


def _parse_education(text):
    """Parse the EDUCATION section into a list of education dicts."""
    section = _get_section(text, 'EDUCATION', _EDUCATION_END_MARKERS)
    entries = []
    for block in _EDU_BLOCK_RE.split(section):
        block = block.strip()
        if len(block) < 15:
            continue
        block_lines = [ln.strip() for ln in block.split('\n') if ln.strip()]
        institution = block_lines[0]
        degree = dates = grade = location = skills = ''
        for line in block_lines[1:]:
            # Degree keywords are matched with a real regex; the previous
            # substring tests on regex-looking literals could never match.
            if _DEGREE_PREFIX_RE.match(line) or _DEGREE_KEYWORD_RE.search(line):
                degree = line
            elif _MONTH_YEAR_RE.match(line):
                dates = line
                if '|' in line:
                    parts = line.split('|')
                    dates = parts[0].strip()
                    location = parts[-1].strip()
            elif _GRADE_RE.match(line):
                grade = line
            elif '·' in line or '|' in line:
                skills = line
        entries.append({
            'institution_name': institution[:255],
            'degree': degree[:255] or 'N/A',
            'start_date': '',
            # The whole date text of the line is stored here unsplit.
            'end_date': dates,
            'grade': grade[:100],
            'location': location[:255],
            'skills': skills[:500],
        })
    return entries


def _parse_certifications(text):
    """Parse the CERTIFICATES / CERTIFICATIONS section into a list of dicts."""
    section = (_get_section(text, 'CERTIFICATES', _CERTIFICATE_END_MARKERS)
               or _get_section(text, 'CERTIFICATIONS', _CERTIFICATE_END_MARKERS))
    entries = [ln.strip() for ln in section.split('\n') if len(ln.strip()) > 5]
    certifications = []
    pos = 0
    while pos < len(entries):
        title = entries[pos]
        issuer = ''
        issued = ''
        if pos + 1 < len(entries):
            candidate = entries[pos + 1]
            # A short line, or one not starting like a capitalised title, is
            # taken as the issuer of the current certificate.
            if not _CERT_TITLE_START_RE.match(candidate) or len(candidate) < 30:
                issuer = candidate
                pos += 1
            if pos + 1 < len(entries) and _MONTH_YEAR_RE.search(entries[pos + 1]):
                issued = entries[pos + 1]
                pos += 1
        pos += 1
        certifications.append({
            'title': title[:500],
            'issuer': issuer[:255],
            'issued_date': issued[:100],
            'skills': '',
        })
    return certifications


def parse_resume(text, first_name='', last_name=''):
    """Parse resume text into structured data.

    Args:
        text: Plain text of the resume; may be empty or None.
        first_name: Fallback first name used when no name is found in text.
        last_name: Fallback last name used when no name is found in text.

    Returns:
        A dict with the keys full_name, location, headline, about,
        about_extended, open_to_work_roles, experiences, education, skills,
        top_skills and certifications. When text is empty only full_name is
        filled in (from the fallback names).
    """
    result = {
        'full_name': '',
        'location': '',
        'headline': '',
        'about': '',
        'about_extended': '',
        'open_to_work_roles': '',
        'experiences': [],
        'education': [],
        'skills': [],
        'top_skills': [],
        'certifications': []
    }
    fallback_name = f"{first_name} {last_name}".strip() if (first_name or last_name) else ''

    if not text or not text.strip():
        result['full_name'] = fallback_name
        return result

    # Normalise carriage returns once and use the same text for every step.
    normalized = text.replace('\r', '\n')
    lines = [ln.strip() for ln in normalized.split('\n') if ln.strip()]

    full_name, location = _find_name_and_location(lines)
    result['full_name'] = full_name or fallback_name
    result['location'] = location

    all_skills = _parse_skills(normalized)
    result['top_skills'] = all_skills[:_MAX_TOP_SKILLS]
    result['skills'] = [
        {
            'skill_name': skill,
            'institution': '',
            'is_top_skill': 1 if rank < _MAX_TOP_SKILLS else 0,
        }
        for rank, skill in enumerate(all_skills[:_MAX_LISTED_SKILLS])
    ]

    # The headline is the top skills, or a generic default when none exist.
    if result['top_skills']:
        result['headline'] = ' | '.join(result['top_skills'])
    else:
        result['headline'] = 'Software Engineer | Web Developer'

    result['open_to_work_roles'] = 'Software Engineer, Full-Stack Developer, Web Developer'

    result['experiences'] = _parse_experiences(normalized)
    result['education'] = _parse_education(normalized)
    result['certifications'] = _parse_certifications(normalized)

    # A templated summary is generated only when something was parsed.
    if result['education'] or result['skills']:
        skill_str = ', '.join(result['top_skills']) if result['top_skills'] else 'full-stack development'
        result['about'] = f"Software Engineering graduate with experience in {skill_str}. Skilled in building web applications with clean architecture and user-focused design."
        result['about_extended'] = "Experienced in RESTful APIs, MVC, and Agile methodologies. Proficient with modern development tools. Strong soft skills in leadership, communication, and problem-solving."

    return result

def main():
    """Command-line entry point: parse the PDF named in argv and print JSON.

    The first argument is the PDF path; the optional second and third
    arguments are a first and last name. The parsed resume, or an error
    object when no text could be extracted, is printed as JSON on stdout.
    With no arguments an error object is written to stderr and the process
    exits with status 1.
    """
    args = sys.argv[1:]
    if not args:
        print(json.dumps({'error': 'No PDF path provided'}), file=sys.stderr)
        sys.exit(1)

    # Pad with empty strings so the optional name arguments default to ''.
    pdf_path, first_name, last_name = (args + ['', ''])[:3]

    text = extract_text_from_pdf(pdf_path)
    if text is not None:
        result = parse_resume(text, first_name, last_name)
    else:
        result = {
            'error': 'Could not extract text from PDF',
            'full_name': f"{first_name} {last_name}".strip(),
        }

    print(json.dumps(result, ensure_ascii=False))

# Run the command-line entry point only when executed as a script, not
# when this module is imported.
if __name__ == '__main__':
    main()
