"""
Enhanced Performance Data Service
Provides comprehensive GSC and GA4 data fetching including:
- Rich Results
- Core Web Vitals
- Per-page queries
- Keyword cloud generation
"""

import json
import os
from datetime import datetime, timedelta
from google.oauth2.credentials import Credentials
from googleapiclient.discovery import build
from google.analytics.data_v1beta import BetaAnalyticsDataClient
from google.analytics.data_v1beta.types import (
    RunReportRequest, DateRange, Dimension, Metric, 
    FilterExpression, Filter
)
import matplotlib
matplotlib.use('Agg')  # Non-GUI backend
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import numpy as np
from PIL import Image


class PerformanceDataService:
    """Service for fetching and processing performance data from GSC and GA4"""
    
    def __init__(self, app_instance_path):
        self.instance_path = app_instance_path
        self.wordcloud_dir = os.path.join(app_instance_path, 'wordclouds')
        os.makedirs(self.wordcloud_dir, exist_ok=True)
    
    def load_credentials(self, credentials_path):
        """
        Read a JSON file of stored OAuth fields and build a Credentials object.

        Args:
            credentials_path: path to a JSON file containing 'token',
                'token_uri', 'client_id', 'client_secret' and 'scopes';
                'refresh_token' is optional and defaults to None.

        Returns:
            A Credentials instance built from those fields.

        Raises:
            KeyError: if one of the required fields is missing.
        """
        with open(credentials_path, 'r') as handle:
            stored = json.load(handle)

        # Only refresh_token is looked up leniently; every other field is mandatory.
        oauth_fields = {
            'token': stored['token'],
            'refresh_token': stored.get('refresh_token'),
            'token_uri': stored['token_uri'],
            'client_id': stored['client_id'],
            'client_secret': stored['client_secret'],
            'scopes': stored['scopes'],
        }
        return Credentials(**oauth_fields)
    
    def fetch_gsc_enhanced_data(self, credentials_path, property_id, urls, start_date, end_date):
        """
        Fetch per-URL Search Console data.

        For every URL three Search Analytics queries are issued, each filtered
        to that exact page:
        - 'page' dimension: clicks, impressions, CTR and position
        - 'query' dimension: up to 100 queries for the page
        - 'searchAppearance' dimension: search appearances, mapped to rich
          result labels

        Args:
            credentials_path: path to the stored OAuth credentials JSON file.
            property_id: value passed as ``siteUrl`` to the API.
            urls: iterable of page URLs to report on.
            start_date: value passed as ``startDate`` in the request body.
            end_date: value passed as ``endDate`` in the request body.

        Returns:
            dict mapping each URL to a dict with the keys 'clicks',
            'impressions', 'ctr' (multiplied by 100), 'position', 'queries',
            'rich_results' and 'search_appearances'. A URL whose requests fail
            keeps the zero/empty defaults for whatever was not fetched; the
            error is printed and processing continues with the next URL.
        """
        creds = self.load_credentials(credentials_path)
        service = build('searchConsole', 'v1', credentials=creds)

        def run_page_query(page_url, dimension, row_limit=None):
            """Run one Search Analytics query restricted to a single page URL."""
            body = {
                'startDate': start_date,
                'endDate': end_date,
                'dimensions': [dimension],
                'dimensionFilterGroups': [{
                    'filters': [{
                        'dimension': 'page',
                        'operator': 'equals',
                        'expression': page_url
                    }]
                }]
            }
            if row_limit is not None:
                body['rowLimit'] = row_limit
            return service.searchanalytics().query(
                siteUrl=property_id,
                body=body
            ).execute()

        result = {}

        for url in urls:
            url_data = {
                'clicks': 0,
                'impressions': 0,
                'ctr': 0,
                'position': 0,
                'queries': [],
                'rich_results': [],
                'search_appearances': []
            }

            try:
                # 1. Basic performance metrics
                basic_rows = run_page_query(url, 'page').get('rows') or []
                if basic_rows:
                    row = basic_rows[0]
                    url_data['clicks'] = row.get('clicks', 0)
                    url_data['impressions'] = row.get('impressions', 0)
                    url_data['ctr'] = row.get('ctr', 0) * 100
                    url_data['position'] = row.get('position', 0)

                # 2. Queries for this page (top 100)
                query_rows = run_page_query(url, 'query', row_limit=100).get('rows') or []
                url_data['queries'] = [{
                    'query': row['keys'][0],
                    'clicks': row.get('clicks', 0),
                    'impressions': row.get('impressions', 0),
                    'ctr': row.get('ctr', 0) * 100,
                    'position': row.get('position', 0)
                } for row in query_rows]

                # 3. Search appearance data (rich results). This step is
                # best-effort: a failure here must not discard the metrics
                # and queries collected above.
                try:
                    appearance_rows = run_page_query(url, 'searchAppearance').get('rows') or []
                    url_data['search_appearances'] = [
                        row['keys'][0] for row in appearance_rows
                    ]
                    url_data['rich_results'] = self._map_search_appearances_to_rich_results(
                        url_data['search_appearances']
                    )
                except Exception as e:
                    # Search appearance dimension may not be available for all
                    # properties. Report it rather than hiding it, and let
                    # KeyboardInterrupt/SystemExit propagate.
                    print(f"Search appearance data unavailable for {url}: {e}")

            except Exception as e:
                print(f"Error fetching GSC data for {url}: {e}")

            result[url] = url_data

        return result
    
    def _map_search_appearances_to_rich_results(self, appearances):
        """
        Translate search appearance codes into rich result labels.

        Codes that are not in the table below are dropped; order and
        duplicates of the recognised codes are preserved.
        """
        labels_by_appearance = {
            'RECIPE': 'Recipe',
            'JOB_LISTING': 'Job Posting',
            'REVIEW_SNIPPET': 'Review',
            'VIDEO': 'Video',
            'PRODUCT_SNIPPET': 'Product',
            'EVENT': 'Event',
            'FAQ': 'FAQ',
            'HOW_TO': 'How-to',
            'AMP_BLUE_LINK': 'AMP',
            'ANDROID_APP': 'App',
        }

        rich_results = []
        for appearance in appearances:
            label = labels_by_appearance.get(appearance)
            if label is not None:
                rich_results.append(label)
        return rich_results
    
    def fetch_ga4_enhanced_data(self, credentials_path, property_id, urls, start_date, end_date):
        """
        Fetch per-URL GA4 data.

        For every URL two reports are run, both filtered to the URL's exact
        ``pagePath``:
        - traffic/engagement: sessions, screenPageViews,
          averageSessionDuration, bounceRate, conversions
        - Core Web Vitals: experimental_lcp, experimental_fid,
          experimental_cls (best-effort; requires specific setup in GA4)

        Args:
            credentials_path: path to the stored OAuth credentials JSON file.
            property_id: GA4 property id, used as ``properties/{property_id}``.
            urls: iterable of page URLs; only the path part is used for
                filtering.
            start_date: start of the report date range.
            end_date: end of the report date range.

        Returns:
            dict mapping each URL to a dict with the keys 'sessions',
            'pageviews', 'avg_session_duration', 'bounce_rate', 'conversions'
            and 'cwv' (a dict with 'lcp', 'fid', 'cls'). Values that could not
            be fetched stay 0; errors are printed and the next URL is processed.
        """
        creds = self.load_credentials(credentials_path)

        # Initialize GA4 client with credentials
        client = BetaAnalyticsDataClient(credentials=creds)

        def build_request(page_path, metric_names):
            """Build a report request for the given metrics on one exact pagePath."""
            return RunReportRequest(
                property=f"properties/{property_id}",
                date_ranges=[DateRange(start_date=start_date, end_date=end_date)],
                dimensions=[Dimension(name="pagePath")],
                metrics=[Metric(name=metric_name) for metric_name in metric_names],
                dimension_filter=FilterExpression(
                    filter=Filter(
                        field_name="pagePath",
                        string_filter=Filter.StringFilter(
                            match_type=Filter.StringFilter.MatchType.EXACT,
                            value=page_path
                        )
                    )
                )
            )

        def to_number(raw, cast):
            """Convert a metric value string to a number; blank values become 0."""
            if not raw:
                return 0
            # Going through float keeps int() from failing on a value such as
            # "3.0". NOTE(review): whether GA4 ever formats count metrics with
            # a decimal part is not confirmed here; this is purely defensive.
            return cast(float(raw))

        result = {}

        for url in urls:
            url_data = {
                'sessions': 0,
                'pageviews': 0,
                'avg_session_duration': 0,
                'bounce_rate': 0,
                'conversions': 0,
                'cwv': {
                    'lcp': 0,
                    'fid': 0,
                    'cls': 0
                }
            }

            try:
                # Both reports use the same filter, so resolve the path once.
                page_path = self._extract_path_from_url(url)

                # Basic traffic metrics
                response = client.run_report(build_request(page_path, [
                    "sessions",
                    "screenPageViews",
                    "averageSessionDuration",
                    "bounceRate",
                    "conversions",
                ]))

                if response.row_count > 0:
                    values = response.rows[0].metric_values
                    url_data['sessions'] = to_number(values[0].value, int)
                    url_data['pageviews'] = to_number(values[1].value, int)
                    url_data['avg_session_duration'] = to_number(values[2].value, float)
                    url_data['bounce_rate'] = to_number(values[3].value, float)
                    url_data['conversions'] = to_number(values[4].value, int)

                # Fetch Core Web Vitals if available
                # Note: CWV data requires specific setup in GA4
                try:
                    cwv_response = client.run_report(build_request(page_path, [
                        "experimental_lcp",
                        "experimental_fid",
                        "experimental_cls",
                    ]))

                    if cwv_response.row_count > 0:
                        cwv_values = cwv_response.rows[0].metric_values
                        url_data['cwv']['lcp'] = to_number(cwv_values[0].value, float)
                        url_data['cwv']['fid'] = to_number(cwv_values[1].value, float)
                        url_data['cwv']['cls'] = to_number(cwv_values[2].value, float)
                except Exception as e:
                    # CWV metrics may not be available. Keep the traffic
                    # metrics gathered above, but report the failure and let
                    # KeyboardInterrupt/SystemExit propagate.
                    print(f"Core Web Vitals data unavailable for {url}: {e}")

            except Exception as e:
                print(f"Error fetching GA4 data for {url}: {e}")

            result[url] = url_data

        return result
    
    def _extract_path_from_url(self, url):
        """
        Return the path component of ``url`` for GA4 filtering.

        Falls back to '/' when the parsed URL has an empty path.
        """
        from urllib.parse import urlparse

        path = urlparse(url).path
        if not path:
            return '/'
        return path
    
    def generate_keyword_cloud(self, queries, url, report_id):
        """
        Generate a keyword cloud image from query data.

        Each query is split into lower-cased words; every word longer than two
        characters that is not a stop word is weighted by
        ``clicks * 2 + impressions`` of the query it came from.

        Args:
            queries: list of dicts with 'query', 'clicks' and 'impressions'
                keys (missing or None values count as empty/zero).
            url: page URL the cloud belongs to; only used to derive the file
                name.
            report_id: report identifier embedded in the file name.

        Returns:
            Path of the PNG written inside ``self.wordcloud_dir``, or None
            when there is no query data or no word with a positive weight.
        """
        # Local import, like the urllib import in _extract_path_from_url.
        import hashlib

        if not queries:
            return None

        stop_words = {'the', 'and', 'for', 'with', 'from', 'this', 'that'}

        # Create frequency dict weighted by clicks and impressions
        word_freq = {}
        for query_data in queries:
            query = query_data.get('query') or ''
            clicks = query_data.get('clicks') or 0
            impressions = query_data.get('impressions') or 0

            # Clicks count double so that converting queries stand out.
            weight = clicks * 2 + impressions
            if weight <= 0:
                # Zero-weight words add nothing to the picture, and a table
                # containing only zeros cannot be scaled relative to its
                # largest entry by the word cloud generator.
                continue

            for word in query.lower().split():
                if len(word) > 2 and word not in stop_words:
                    word_freq[word] = word_freq.get(word, 0) + weight

        if not word_freq:
            return None

        # Generate word cloud
        wordcloud = WordCloud(
            width=800,
            height=400,
            background_color='#0f172a',  # Dark background matching UI
            colormap='viridis',
            relative_scaling=0.5,
            min_font_size=10
        ).generate_from_frequencies(word_freq)

        # The built-in hash() of a string is salted per interpreter process,
        # so it would yield a different (possibly negative) file name for the
        # same URL after every restart. A content digest is stable.
        url_digest = hashlib.sha256(str(url).encode('utf-8')).hexdigest()[:16]
        filename = f"wordcloud_report_{report_id}_url_{url_digest}.png"
        filepath = os.path.join(self.wordcloud_dir, filename)

        fig = plt.figure(figsize=(10, 5))
        try:
            plt.imshow(wordcloud, interpolation='bilinear')
            plt.axis('off')
            plt.tight_layout(pad=0)
            plt.savefig(filepath, bbox_inches='tight', facecolor='#0f172a')
        finally:
            # Always release the figure, even when saving fails.
            plt.close(fig)

        return filepath
    
    def generate_insights_and_recommendations(self, url_metrics_list, totals, timeframe):
        """
        Build a Markdown performance summary and a list of recommendations.

        The output comes from fixed threshold rules (CTR, average position,
        Core Web Vitals, rich result coverage, query counts); no external
        service or model is called.

        Args:
            url_metrics_list: list of per-URL dicts. Keys read here: 'url',
                'clicks', 'impressions', 'ctr', 'position', 'cwv' (dict with
                'lcp' and 'cls'), 'rich_results' and 'queries'. Missing keys
                fall back to zero/empty values.
            totals: dict with 'clicks', 'impressions' and 'ctr' (required) and
                'avg_position' (optional, defaults to 0).
            timeframe: dict with 'start' and 'end'.

        Returns:
            (insights_text, recommendations_list): the Markdown text and a list
            of dicts with the keys 'type', 'category', 'title', 'description'
            and 'actions'.

        Raises:
            KeyError: if a required key of ``totals`` or ``timeframe`` is missing.
        """
        insights = []
        recommendations = []

        def cwv_of(page):
            """Return (lcp, cls) for a page, treating missing or None data as 0."""
            cwv = page.get('cwv') or {}
            return cwv.get('lcp') or 0, cwv.get('cls') or 0

        # Overall performance analysis
        insights.append("## 📊 Performance Analysis\n")
        insights.append(f"**Analysis Period:** {timeframe['start']} to {timeframe['end']}\n")
        insights.append(f"- Total Clicks: **{totals['clicks']:,}**")
        insights.append(f"- Total Impressions: **{totals['impressions']:,}**")
        insights.append(f"- Average CTR: **{totals['ctr']:.2f}%**")
        insights.append(f"- Average Position: **{totals.get('avg_position', 0):.1f}**\n")

        # CTR Analysis and Recommendations
        if totals['ctr'] < 2.0:
            insights.append("⚠️ **Low CTR Detected** - Below industry average (2-3%)\n")
            recommendations.append({
                'type': 'critical',
                'category': 'CTR Optimization',
                'title': 'Improve Click-Through Rate',
                'description': 'Your average CTR is below industry benchmarks',
                'actions': [
                    'Optimize meta titles to include primary keywords early',
                    'Write compelling meta descriptions with clear value propositions',
                    'Add structured data to enable rich snippets',
                    'Test emotional triggers and numbers in titles'
                ]
            })
        elif totals['ctr'] > 5.0:
            insights.append("✅ **Excellent CTR Performance** - Above industry average!\n")

        # Position Analysis
        avg_position = totals.get('avg_position', 0)
        if avg_position > 10:
            insights.append(f"⚠️ **Average Position Beyond Page 1** (Position {avg_position:.1f})\n")
            recommendations.append({
                'type': 'important',
                'category': 'Rankings',
                'title': 'Improve Search Rankings',
                'description': 'Most pages are ranking beyond the first page of search results',
                'actions': [
                    'Conduct keyword gap analysis to find easier ranking opportunities',
                    'Improve internal linking to target pages',
                    'Enhance content depth and quality',
                    'Build high-quality backlinks from relevant sources'
                ]
            })

        # Per-URL Analysis
        insights.append("\n## 🎯 Top Performing Pages\n")

        # Sort by clicks; only the five best pages are listed.
        sorted_urls = sorted(url_metrics_list, key=lambda x: x.get('clicks', 0), reverse=True)

        for i, url_data in enumerate(sorted_urls[:5], 1):
            url = url_data.get('url', 'Unknown')
            clicks = url_data.get('clicks', 0)
            impressions = url_data.get('impressions', 0)
            ctr = url_data.get('ctr', 0)
            position = url_data.get('position', 0)

            insights.append(f"**{i}. {url}**")
            insights.append(f"   - Clicks: {clicks:,} | Impressions: {impressions:,} | CTR: {ctr:.2f}% | Position: {position:.1f}")

            # CWV Analysis (LCP is reported in seconds in the message below)
            lcp, cls = cwv_of(url_data)
            cwv_issues = []
            if lcp > 2.5:
                cwv_issues.append(f"LCP: {lcp:.2f}s (should be < 2.5s)")
            if cls > 0.1:
                cwv_issues.append(f"CLS: {cls:.3f} (should be < 0.1)")

            if cwv_issues:
                insights.append(f"   - ⚠️ CWV Issues: {', '.join(cwv_issues)}")

            # Rich Results
            rich_results = url_data.get('rich_results', [])
            if rich_results:
                insights.append(f"   - ✨ Rich Results: {', '.join(rich_results)}")

            insights.append("")

        # Rich Results Opportunities: recommend only when more than half of
        # the pages have none.
        pages_without_rich_results = [u for u in url_metrics_list if not u.get('rich_results')]
        if len(pages_without_rich_results) > len(url_metrics_list) * 0.5:
            recommendations.append({
                'type': 'opportunity',
                'category': 'Rich Results',
                'title': 'Implement Structured Data',
                'description': f'{len(pages_without_rich_results)} pages lack rich results',
                'actions': [
                    'Add relevant schema markup (Article, Product, Recipe, FAQ, etc.)',
                    'Test structured data with Google Rich Results Test',
                    'Monitor rich result performance in GSC',
                    'Prioritize high-traffic pages first'
                ]
            })

        # Core Web Vitals Recommendations (all pages, not just the top five)
        pages_with_cwv_issues = []
        for url_data in url_metrics_list:
            lcp, cls = cwv_of(url_data)
            if lcp > 2.5 or cls > 0.1:
                pages_with_cwv_issues.append(url_data.get('url'))

        if pages_with_cwv_issues:
            recommendations.append({
                'type': 'critical',
                'category': 'Core Web Vitals',
                'title': 'Fix Core Web Vitals Issues',
                'description': f'{len(pages_with_cwv_issues)} pages have CWV issues affecting user experience',
                'actions': [
                    'Optimize LCP by improving server response times and resource loading',
                    'Reduce CLS by specifying image dimensions and avoiding layout shifts',
                    'Use CDN and optimize images',
                    'Implement lazy loading for below-fold content'
                ]
            })

        # Query Diversity Analysis
        insights.append("\n## 🔍 Keyword Insights\n")

        # Count distinct query strings: the same query usually shows up for
        # several pages, so adding up the per-page list lengths would overstate
        # the number reported as "unique".
        unique_queries = set()
        for url_data in url_metrics_list:
            for entry in url_data.get('queries') or []:
                query_text = entry.get('query') if isinstance(entry, dict) else entry
                if query_text:
                    unique_queries.add(query_text)

        if unique_queries:
            insights.append(f"- Total unique queries tracked: **{len(unique_queries)}**")

            # Find pages with high query diversity
            high_diversity_pages = [
                u for u in url_metrics_list if len(u.get('queries') or []) > 20
            ]
            if high_diversity_pages:
                insights.append(f"- {len(high_diversity_pages)} pages ranking for 20+ keywords")

        insights_text = '\n'.join(insights)

        return insights_text, recommendations
