
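"""Convert a Google+ Takeout posts archive into a small static HTML site.

Reads the per-post HTML files exported by Google Takeout, filters them by
visibility and content, rewrites each kept post into a clean standalone page
(copying referenced local images into an images/ subdirectory unless
--no-images is given), and regenerates an index.html listing every post.

Illustrative invocation; the script name and input path are placeholders for
your own layout, not values defined by this module:

    python gplus_to_static.py \
        --input-dir "Takeout/Google+ Stream/Posts" \
        --output-dir site \
        --visibility Public \
        --show-visibility

Note: the generated pages and index link to a style.css in the output
directory, which this script does not create.
"""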
import argparse
import copy
import hashlib
import html
import os
import re
import shutil
import traceback
from urllib.parse import unquote

from bs4 import BeautifulSoup

def clean_filename(text):
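    """Slugify the leading text of a post for use as a filename component.

    Illustrative example (hypothetical input, not from real archive data):
        clean_filename("Hello, World! Great day")  ->  "hello-world-great-day"
    """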
    # Basic slugify
    text = text.lower()
    text = re.sub(r'[^\w\s-]', '', text)
    text = re.sub(r'[-\s]+', '-', text).strip('-')
    return text[:50] # Limit length

def convert_posts(input_dir, output_dir, visibility_filter, no_images, min_text_length, show_visibility):
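    """Convert each .html post in input_dir into a cleaned page under output_dir.

    visibility_filter: comma-separated visibility names (e.g. "Public") a post
        must match to be kept;
    no_images: strip local images instead of copying them into images/;
    min_text_length: minimum non-media text length for album/photo-only posts;
    show_visibility: add an "Original Visibility" subhead to each page.
    """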
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    
    images_dir = os.path.join(output_dir, "images")
    if not no_images and not os.path.exists(images_dir):
        os.makedirs(images_dir)

    count_total = 0
    count_converted = 0
    count_skipped_visibility = 0
    count_skipped_content = 0
    count_skipped_duplicate = 0
    
    converted_files = []  # (date, filename, title) tuples; index.html itself is rebuilt by generate_index()
    seen_content_hashes = set()

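    # e.g. --visibility "Public, Circles" -> ['public', 'circles']; posts are
    # kept if any of these appears as a substring of their visibility text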
    allowed_visibilities = [v.strip().lower() for v in visibility_filter.split(',')]

    print(f"Scanning {input_dir}...")
    
    files = sorted([f for f in os.listdir(input_dir) if f.endswith('.html')])
    
    for filename in files:
        filepath = os.path.join(input_dir, filename)
        count_total += 1
        
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                soup = BeautifulSoup(f, 'html.parser')

            # 1. Visibility Filter
            vis_div = soup.find('div', class_='visibility')
            if not vis_div:
                count_skipped_visibility += 1
                continue
            
            vis_text = vis_div.get_text(strip=True).lower()
            
            is_visible = False
            for v in allowed_visibilities:
                if v in vis_text:
                    is_visible = True
                    break
            
            if not is_visible:
                count_skipped_visibility += 1
                continue

            # 2. Content Filter (Pre-check)
            
            body = soup.find('body')
            content_blocks = []
            
            # Classes marking metadata blocks (not post content); these are
            # excluded from the extracted body
            ignored_classes = ['visibility', 'post-activity', 'comments', 'plus-oners', 'resharers', 'location']
            
            # Find the true header div (contains author info)
            header_div = None
            if body:
                # The post header in the export is a flex-styled div holding
                # the author photo/link and the date link.
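                # Assumed shape of the export header (inferred from the
                # selectors used below, not from a published format spec):
                #   <div style="...display:flex...">
                #     <img class="author-photo" ...>
                #     <a class="author" href="...">Author Name</a>
                #     <a href="...">2018-10-08 ...</a>
                #   </div>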
                flex_divs = body.find_all('div', style=lambda s: s and 'display:flex' in s)
                for d in flex_divs:
                    # Check if it contains author info
                    if d.find('img', class_='author-photo') or d.find('a', class_='author'):
                        header_div = d
                        break
            
            if header_div and body:
                # Strategy: start at the header div and collect its following
                # siblings, then walk up to each ancestor in turn and collect
                # that ancestor's following siblings, stopping at <body>.
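                # Illustration (assumed nesting, for orientation only):
                #   body > wrapper > [header_div, text, album]
                # Pass 1 collects `text` and `album` (the header's siblings);
                # pass 2 collects the wrapper's own later siblings, if any.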
                
                curr = header_div
                while curr and curr != body:
                    for sibling in curr.next_siblings:
                        if sibling.name in ['div', 'a', 'p', 'img', 'header', 'footer']:
                            classes = sibling.get('class', [])
                            if any(c in ignored_classes for c in classes):
                                continue
                            content_blocks.append(sibling)
                    curr = curr.parent
            else:
                # Fallback if no header found (unlikely for valid posts)
                if body:
                    for child in body.find_all(['div', 'a', 'p', 'img'], recursive=False):
                        classes = child.get('class', [])
                        if any(c in ignored_classes for c in classes):
                            continue
                        content_blocks.append(child)

            # Calculate total text length from all blocks
            # REFINEMENT: Exclude captions/text from albums/media for filtering purposes 
            # as they often contain repetitive metadata (like "Reading, Massachusetts")
            full_text = ""
            filtering_text = ""
            for block in content_blocks:
                block_text = block.get_text(separator=' ', strip=True) + " "
                full_text += block_text
                
                # If this block is an album or other media container, don't
                # count its text for filtering (substring match also catches
                # variants like 'media-link')
                block_classes = str(block.get('class', ''))
                is_media_container = 'album' in block_classes or 'media' in block_classes
                
                if not is_media_container:
                    filtering_text += block_text
            
            full_text = full_text.strip()
            filtering_text = filtering_text.strip()
            
            # Deduplication check
            # We need raw_date and author for dedupe and header
            raw_date_str = ""
            author_name = "Unknown Author"
            if header_div:
                links = header_div.find_all('a')
                for link in links:
                    text = link.get_text(strip=True)
                    if re.match(r'\d{4}-\d{2}-\d{2}', text):
                        raw_date_str = text
                    elif 'author' in str(link.get('class', '')):
                         author_name = text
                
                # If author_name not found by class, check if it's the first link
                if author_name == "Unknown Author" and links:
                    author_name = links[0].get_text(strip=True)
            
            # Content fingerprint on date + full text; MD5 is used only as a
            # cheap signature for dedupe, not for security
            content_sig = f"{raw_date_str}|{full_text}"
            content_hash = hashlib.md5(content_sig.encode('utf-8')).hexdigest()
            
            if content_hash in seen_content_hashes:
                count_skipped_duplicate += 1
                continue
            
            seen_content_hashes.add(content_hash)
            
            # Check for specific content types to adjust filtering
            has_album = len(soup.find_all(class_=['album', 'media'])) > 0
            has_link = len(soup.find_all(class_=['link-embed', 'media-link'])) > 0
            has_reshare = len(soup.find_all(class_='reshare-attribution')) > 0
            
            text_len = len(filtering_text)
            
            # REFINEMENT: Posts with ONLY local G+ image links should be treated like album posts
            has_local_photo_link = False
            for block in content_blocks:
                for a in block.find_all('a', href=True):
                    if '/Photos/' in a['href']:
                        has_local_photo_link = True
                        break
                if has_local_photo_link:
                    break
            
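            # Filtering rules, as implemented below:
            #   - album / local-photo posts: kept only if their non-media text
            #     is at least min_text_length characters
            #   - link-embed, reshare, and plain-text posts: kept if they have
            #     any non-media text at all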
            if has_album or has_local_photo_link:
                if text_len < min_text_length:
                     count_skipped_content += 1
                     continue
            elif has_link or has_reshare:
                if text_len == 0:
                     count_skipped_content += 1
                     continue
            else:
                if text_len == 0:
                    count_skipped_content += 1
                    continue

            
            # 3. Extraction & Conversion
            # Date prefix (YYYY-MM-DD) from the header's date link, with an
            # explicit fallback when no date was found
            if re.match(r'\d{4}-\d{2}-\d{2}', raw_date_str):
                ymd = raw_date_str[:10]
            else:
                ymd = "0000-00-00"

            # Build the filename slug from the leading post text
            clean_text_for_slug = full_text.split('\n')[0] if full_text else "post"
            slug = clean_filename(clean_text_for_slug[:50])
            if not slug:
                slug = "post"
                
            out_filename = f"{ymd}-{slug}.html"
            out_filepath = os.path.join(output_dir, out_filename)
            
            # Avoid filename collisions by appending a numeric suffix
            counter = 1
            while os.path.exists(out_filepath):
                out_filename = f"{ymd}-{slug}-{counter}.html"
                out_filepath = os.path.join(output_dir, out_filename)
                counter += 1

            # 4. Content Cleaning / Image Copying
            
            out_soup = BeautifulSoup("<!DOCTYPE html><html><head><meta charset='utf-8'><title></title><link rel='stylesheet' href='style.css'></head><body><article class='gplus-post'></article></body></html>", 'html.parser')
            article = out_soup.find('article')
            out_soup.title.string = (full_text[:60] + "...") if full_text else "G+ Post"
            
            # Header
            header_html = out_soup.new_tag('header')
            
            # Standard Archive Header
            archive_header = out_soup.new_tag('p', attrs={'class': 'archive-header'})
            archive_header.string = f"Google Plus Post by {author_name} from {raw_date_str}"
            header_html.append(archive_header)

            if show_visibility:
                vis_p = out_soup.new_tag('p', attrs={'class': 'post-visibility'})
                vis_p.string = f"Original Visibility: {vis_text}"
                header_html.append(vis_p)
            
            article.append(header_html)
            
            # Body
            body_div = out_soup.new_tag('div', attrs={'class': 'post-body'})
            
            # Add content blocks
            for block in content_blocks:
                new_block = copy.copy(block)
                
                # REFINEMENT: Detect reshares and link-embeds for styling
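                # Sketch of the reshare transform (the input shape is an
                # assumption based on the selectors used, not a documented format):
                #   <div> <a class="reshare-attribution">Originally shared by X</a> quoted content </div>
                # becomes, in the output body:
                #   <span class="reshare-attribution">Originally shared by X:</span>
                #   <div class="... reshare"> quoted content </div>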
                reshare_attr = new_block.find(class_='reshare-attribution')
                if reshare_attr:
                    # Add colon if not present
                    if not reshare_attr.get_text().strip().endswith(':'):
                        reshare_attr.append(':')
                    
                    # Unwrap all internal links
                    for link in reshare_attr.find_all('a'):
                        link.unwrap()
                    
                    # If the attribution itself is an 'a', convert it to a span and strip link attributes
                    if reshare_attr.name == 'a':
                        reshare_attr.name = 'span'
                        if reshare_attr.has_attr('href'): del reshare_attr['href']
                        if reshare_attr.has_attr('target'): del reshare_attr['target']

                    # Pull attribution out and append to body separately
                    reshare_attr_extracted = reshare_attr.extract()
                    body_div.append(reshare_attr_extracted)
                    
                    # The remainder of the block becomes the quote box
                    new_block['class'] = new_block.get('class', []) + ['reshare']

                
                if new_block.find(class_='link-embed'):
                    # The block itself might contain the link-embed, or be the link-embed
                    # REFINEMENT: Only add wrapper if NOT already a reshare to avoid conflict
                    if 'link-embed' not in new_block.get('class', []) and 'reshare' not in new_block.get('class', []):
                         new_block['class'] = new_block.get('class', []) + ['link-embed-wrapper']

                # Cleanup internal G+ links like profile mentions and strip ignored classes recursively
                # REFINEMENT: Also strip dead links to local photos
                for a in new_block.find_all('a'):
                    href = a.get('href', '')
                    classes = a.get('class', [])
                    
                    # Strip local photo links - they are dead ends
                    if '/Photos/' in href:
                        a.decompose()
                        continue

                    if any(c in ignored_classes for c in classes):
                        a.decompose()
                        continue
                    if 'plus.google.com' in href or 'profile' in str(classes):
                         a.unwrap()
                
                # Broad cleanup for any other tags with ignored classes
                for tag in new_block.find_all(class_=ignored_classes):
                    tag.decompose()

                if no_images:
                    # REFINEMENT: Remove images if --no-images, but KEEP external ones
                    for img in new_block.find_all('img'):
                        src = img.get('src', '')
                        if src.startswith('..') or not (src.startswith('http://') or src.startswith('https://')):
                            img.decompose()
                    # Also cleanup empty links
                    for a in new_block.find_all('a'):
                        if not a.get_text(strip=True) and not a.find_all('img'):
                            a.decompose()
                else:
                    # Process images
                    for img in new_block.find_all('img'):
                        src = img.get('src')
                        if src:
                            if src.startswith('..'):
                                try:
                                    # Image refs are relative and may be URL-encoded;
                                    # decode before resolving on disk
                                    src_decoded = unquote(src)
                                    src_path = os.path.normpath(os.path.join(input_dir, src_decoded))

                                    if os.path.exists(src_path):
                                        img_filename = os.path.basename(src_path)
                                        dest_path = os.path.join(images_dir, img_filename)
                                        if not os.path.exists(dest_path):
                                            shutil.copy2(src_path, dest_path)

                                        img['src'] = f"images/{img_filename}"
                                        # Drop fixed sizing so style.css controls
                                        # layout; pop() tolerates missing attributes
                                        for attr in ('width', 'height', 'style'):
                                            img.attrs.pop(attr, None)
                                    else:
                                        print(f"Warning: Image not found {src_path}")
                                except Exception as e:
                                    print(f"Error processing image {src}: {e}")

                # REFINEMENT: Skip effectively empty blocks (no text, no meaningful tags left)
                # Especially helpful for div.album or div.post-activity after stripping
                text_content = new_block.get_text(strip=True)
                has_imgs = len(new_block.find_all('img')) > 0
                has_links = len(new_block.find_all('a')) > 0
                
                if not text_content and not has_imgs and not has_links:
                    continue

                body_div.append(new_block)

            # Comments were excluded from content_blocks above, so pull the
            # comments div from the source post and append it last
            comments_div = soup.find('div', class_='comments')
            if comments_div:
                new_comments = copy.copy(comments_div)
                # Clean profile links in comments too
                for a in new_comments.find_all('a'):
                    href = a.get('href', '')
                    if 'plus.google.com' in href or 'profile' in str(a.get('class', '')):
                         a.unwrap()
                
                body_div.append(new_comments)

            article.append(body_div)
            
            # Write Output
            with open(out_filepath, 'w', encoding='utf-8') as f_out:
                f_out.write(str(out_soup))
                
            converted_files.append((ymd, out_filename, (full_text[:100] + "...") if full_text else "No Text"))
            count_converted += 1

        except Exception as e:
            print(f"Error converting {filename}: {e}")
            traceback.print_exc()

    print(f"Skipped (Duplicate): {count_skipped_duplicate}")
    print(f"Skipped (Visibility): {count_skipped_visibility}")
    print(f"Skipped (Content/Low Value): {count_skipped_content}")
    print(f"Converted: {count_converted}")
    
    generate_index(output_dir)
    print("-" * 30)

def generate_index(output_dir):
    """Scan output_dir and generate/regenerate index.html"""
    print(f"Regenerating index at {output_dir}/index.html...")
    converted_files = []
    
    # Scan for converted HTML files (excluding index.html)
    files = [f for f in os.listdir(output_dir) if f.endswith('.html') and f != 'index.html']
    
    for filename in files:
        filepath = os.path.join(output_dir, filename)
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                soup = BeautifulSoup(f, 'html.parser')
                
                # Extract the date: prefer p.post-date, then the archive-header
                # text, then the filename prefix
                date_str = "0000-00-00"
                date_p = soup.find('p', class_='post-date')
                if date_p:
                    date_str = date_p.get_text(strip=True)[:10]
                else:
                    header_p = soup.find('p', class_='archive-header')
                    if header_p:
                        match = re.search(r'\d{4}-\d{2}-\d{2}', header_p.get_text())
                        if match:
                            date_str = match.group(0)
                    if date_str == "0000-00-00":
                        # Fall back to the YYYY-MM-DD prefix of the filename
                        match = re.match(r'^(\d{4}-\d{2}-\d{2})', filename)
                        if match:
                            date_str = match.group(1)
                
                # Preview title: leading text of the post body
                title = "G+ Post"
                body = soup.find('div', class_='post-body')
                if body:
                    title = body.get_text(strip=True)[:100]
                
                converted_files.append((date_str, filename, title))
        except Exception as e:
            print(f"Warning: Could not parse {filename} for index: {e}")

    index_path = os.path.join(output_dir, "index.html")
    with open(index_path, 'w', encoding='utf-8') as f_idx:
        f_idx.write("<!DOCTYPE html><html><head><meta charset='utf-8'><title>G+ Archive Index</title>")
        f_idx.write("<link rel='stylesheet' href='style.css'></head><body><div class='index-container'>")
        f_idx.write("<h1>Google+ Archive Index</h1>")
        f_idx.write(f"<p>Total Posts: {len(converted_files)}</p>")
        f_idx.write("<table class='index-table'><thead><tr><th>Date</th><th>Post Preview</th></tr></thead><tbody>")
        for date, fname, title in sorted(converted_files, reverse=True):
            truncated_title = (title[:60] + "...") if len(title) > 60 else title
            f_idx.write(f"<tr><td class='date-col'>{date}</td><td class='title-col'><a href='{fname}'>{html.escape(truncated_title)}</a></td></tr>")
        f_idx.write("</tbody></table></div></body></html>")
    print(f"Index generated with {len(converted_files)} entries.")

def main():
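    """Parse CLI arguments and run the conversion, or just rebuild index.html when --index-only is given."""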
    parser = argparse.ArgumentParser(description='Convert G+ Archive to Static Site')
    parser.add_argument('--input-dir', help='Path to G+ Posts directory')
    parser.add_argument('--output-dir', required=True, help='Destination directory')
    parser.add_argument('--visibility', default='Public', help='Comma-sep list of allowed visibilities')
    parser.add_argument('--no-images', action='store_true', help='Strip images')
    parser.add_argument('--min-text-length', type=int, default=200, help='Min text length for image posts')
    parser.add_argument('--clean', action='store_true', help='Clean output directory before converting')
    parser.add_argument('--show-visibility', action='store_true', help='Show visibility subhead')
    parser.add_argument('--index-only', action='store_true', help='Only regenerate index.html')
    
    args = parser.parse_args()
    
    if args.index_only:
        generate_index(args.output_dir)
        return

    if not args.input_dir:
        parser.error("--input-dir is required unless --index-only is specified")

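    # Note: --clean removes only previously generated .html files; an existing
    # images/ subdirectory and its contents are left in place.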
    if args.clean and os.path.exists(args.output_dir):
        print(f"Cleaning {args.output_dir}...")
        for f in os.listdir(args.output_dir):
            if f.endswith('.html'):
                os.remove(os.path.join(args.output_dir, f))
    
    convert_posts(
        args.input_dir,
        args.output_dir,
        args.visibility,
        args.no_images,
        args.min_text_length,
        args.show_visibility
    )

if __name__ == "__main__":
    main()
