# harshit.pathak / content_quality_tool
							# # import requests
# # import json
# # from typing import Dict, List, Optional
# # from django.conf import settings


# # class ProductAttributeService:
# #     """Service class for extracting product attributes using Groq LLM."""

# #     @staticmethod
# #     def combine_product_text(
# #         title: Optional[str] = None,
# #         short_desc: Optional[str] = None,
# #         long_desc: Optional[str] = None
# #     ) -> str:
# #         """Combine product metadata into a single text block."""
# #         parts = []
# #         if title:
# #             parts.append(str(title).strip())
# #         if short_desc:
# #             parts.append(str(short_desc).strip())
# #         if long_desc:
# #             parts.append(str(long_desc).strip())

# #         combined = " ".join(parts).strip()

# #         if not combined:
# #             return "No product information available"

# #         return combined

# #     @staticmethod
# #     def extract_attributes(
# #         product_text: str,
# #         mandatory_attrs: Dict[str, List[str]],
# #         model: str = None,
# #         extract_additional: bool = True
# #     ) -> dict:
# #         """Use Groq LLM to extract attributes from any product type."""
        
# #         if model is None:
# #             model = settings.SUPPORTED_MODELS[0]

# #         # Check if product text is empty or minimal
# #         if not product_text or product_text == "No product information available":
# #             return ProductAttributeService._create_error_response(
# #                 "No product information provided",
# #                 mandatory_attrs,
# #                 extract_additional
# #             )

# #         # Create structured prompt for mandatory attributes
# #         mandatory_attr_list = []
# #         for attr_name, allowed_values in mandatory_attrs.items():
# #             mandatory_attr_list.append(f"{attr_name}: {', '.join(allowed_values)}")
# #         mandatory_attr_text = "\n".join(mandatory_attr_list)

# #         additional_instruction = ""
# #         if extract_additional:
# #             additional_instruction = """
# # 2. Extract ADDITIONAL attributes: Identify any other relevant attributes from the product text 
# #    (such as Material, Size, Color, Brand, Dimensions, Weight, Features, Specifications, etc.) 
# #    and their values. Extract attributes that are specific and relevant to this product type."""

# #         output_format = {
# #             "mandatory": {attr: "value" for attr in mandatory_attrs.keys()},
# #             "additional": {} if extract_additional else None
# #         }

# #         if not extract_additional:
# #             output_format.pop("additional")

# #         prompt = f"""
# # You are an intelligent product attribute extractor that works with ANY product type.

# # TASK:
# # 1. Extract MANDATORY attributes: For each mandatory attribute, select the most appropriate value 
# #    from the provided list. Choose the value that best matches the product description.
# # {additional_instruction}

# # Product Text:
# # {product_text}

# # Mandatory Attribute Lists (MUST select one value for each):
# # {mandatory_attr_text}

# # CRITICAL INSTRUCTIONS:
# # - Return ONLY valid JSON, nothing else
# # - No explanations, no markdown, no text before or after the JSON
# # - For mandatory attributes, choose EXACTLY ONE value from the provided list that best matches
# # - If a mandatory attribute cannot be determined from the product text, use "Not Specified"
# # - Work with whatever information is available - the product text may be incomplete (only title, or only description, etc.)
# # {f"- For additional attributes, extract any relevant information found in the product text" if extract_additional else ""}
# # - Be precise and only extract information that is explicitly stated or clearly implied

# # Required Output Format (ONLY THIS, NO OTHER TEXT):
# # {json.dumps(output_format, indent=2)}
# #         """

# #         payload = {
# #             "model": model,
# #             "messages": [
# #                 {
# #                     "role": "system",
# #                     "content": f"You are a precise attribute extraction model. Return ONLY valid JSON with {'mandatory and additional' if extract_additional else 'mandatory'} sections. No explanations, no markdown, no other text."
# #                 },
# #                 {"role": "user", "content": prompt}
# #             ],
# #             "temperature": 0.0,
# #             "max_tokens": 1500
# #         }

# #         headers = {
# #             "Authorization": f"Bearer {settings.GROQ_API_KEY}",
# #             "Content-Type": "application/json",
# #         }

# #         try:
# #             response = requests.post(
# #                 settings.GROQ_API_URL,
# #                 headers=headers,
# #                 json=payload,
# #                 timeout=30
# #             )
# #             response.raise_for_status()
# #             result_text = response.json()["choices"][0]["message"]["content"].strip()

# #             # Clean the response
# #             result_text = ProductAttributeService._clean_json_response(result_text)

# #             # Parse JSON
# #             parsed = json.loads(result_text)

# #             # Validate and restructure if needed
# #             parsed = ProductAttributeService._validate_response_structure(
# #                 parsed, mandatory_attrs, extract_additional
# #             )

# #             return parsed

# #         except requests.exceptions.RequestException as e:
# #             return ProductAttributeService._create_error_response(
# #                 str(e), mandatory_attrs, extract_additional
# #             )
# #         except json.JSONDecodeError as e:
# #             return ProductAttributeService._create_error_response(
# #                 f"Invalid JSON: {str(e)}", mandatory_attrs, extract_additional, result_text
# #             )
# #         except Exception as e:
# #             return ProductAttributeService._create_error_response(
# #                 str(e), mandatory_attrs, extract_additional
# #             )

# #     @staticmethod
# #     def _clean_json_response(text: str) -> str:
# #         """Clean LLM response to extract valid JSON."""
# #         start_idx = text.find('{')
# #         end_idx = text.rfind('}')

# #         if start_idx != -1 and end_idx != -1:
# #             text = text[start_idx:end_idx + 1]

# #         if "```json" in text:
# #             text = text.split("```json")[1].split("```")[0].strip()
# #         elif "```" in text:
# #             text = text.split("```")[1].split("```")[0].strip()
# #             if text.startswith("json"):
# #                 text = text[4:].strip()

# #         return text

# #     @staticmethod
# #     def _validate_response_structure(
# #         parsed: dict,
# #         mandatory_attrs: Dict[str, List[str]],
# #         extract_additional: bool
# #     ) -> dict:
# #         """Validate and fix the response structure."""
# #         expected_sections = ["mandatory"]
# #         if extract_additional:
# #             expected_sections.append("additional")

# #         if not all(section in parsed for section in expected_sections):
# #             if isinstance(parsed, dict):
# #                 mandatory_keys = set(mandatory_attrs.keys())
# #                 mandatory = {k: v for k, v in parsed.items() if k in mandatory_keys}
# #                 additional = {k: v for k, v in parsed.items() if k not in mandatory_keys}

# #                 result = {"mandatory": mandatory}
# #                 if extract_additional:
# #                     result["additional"] = additional
# #                 return result
# #             else:
# #                 return ProductAttributeService._create_error_response(
# #                     "Invalid response structure",
# #                     mandatory_attrs,
# #                     extract_additional,
# #                     str(parsed)
# #                 )

# #         return parsed

# #     @staticmethod
# #     def _create_error_response(
# #         error: str,
# #         mandatory_attrs: Dict[str, List[str]],
# #         extract_additional: bool,
# #         raw_output: Optional[str] = None
# #     ) -> dict:
# #         """Create a standardized error response."""
# #         response = {
# #             "mandatory": {attr: "Not Specified" for attr in mandatory_attrs.keys()},
# #             "error": error
# #         }
# #         if extract_additional:
# #             response["additional"] = {}
# #         if raw_output:
# #             response["raw_output"] = raw_output
# #         return response


# import requests
# import json
# from typing import Dict, List, Optional
# from django.conf import settings
# from concurrent.futures import ThreadPoolExecutor, as_completed


# class ProductAttributeService:
#     """Service class for extracting product attributes using Groq LLM."""

#     @staticmethod
#     def combine_product_text(
#         title: Optional[str] = None,
#         short_desc: Optional[str] = None,
#         long_desc: Optional[str] = None
#     ) -> str:
#         """Combine product metadata into a single text block."""
#         parts = []
#         if title:
#             parts.append(str(title).strip())
#         if short_desc:
#             parts.append(str(short_desc).strip())
#         if long_desc:
#             parts.append(str(long_desc).strip())

#         combined = " ".join(parts).strip()

#         if not combined:
#             return "No product information available"

#         return combined

#     @staticmethod
#     def extract_attributes(
#         product_text: str,
#         mandatory_attrs: Dict[str, List[str]],
#         model: str = None,
#         extract_additional: bool = True
#     ) -> dict:
#         """Use Groq LLM to extract attributes from any product type."""
        
#         if model is None:
#             model = settings.SUPPORTED_MODELS[0]

#         # Check if product text is empty or minimal
#         if not product_text or product_text == "No product information available":
#             return ProductAttributeService._create_error_response(
#                 "No product information provided",
#                 mandatory_attrs,
#                 extract_additional
#             )

#         # Create structured prompt for mandatory attributes
#         mandatory_attr_list = []
#         for attr_name, allowed_values in mandatory_attrs.items():
#             mandatory_attr_list.append(f"{attr_name}: {', '.join(allowed_values)}")
#         mandatory_attr_text = "\n".join(mandatory_attr_list)

#         additional_instruction = ""
#         if extract_additional:
#             additional_instruction = """
# 2. Extract ADDITIONAL attributes: Identify any other relevant attributes from the product text 
#    (such as Material, Size, Color, Brand, Dimensions, Weight, Features, Specifications, etc.) 
#    and their values. Extract attributes that are specific and relevant to this product type."""

#         output_format = {
#             "mandatory": {attr: "value" for attr in mandatory_attrs.keys()},
#             "additional": {} if extract_additional else None
#         }

#         if not extract_additional:
#             output_format.pop("additional")

#         prompt = f"""
# You are an intelligent product attribute extractor that works with ANY product type.

# TASK:
# 1. Extract MANDATORY attributes: For each mandatory attribute, select the most appropriate value 
#    from the provided list. Choose the value that best matches the product description.
# {additional_instruction}

# Product Text:
# {product_text}

# Mandatory Attribute Lists (MUST select one value for each):
# {mandatory_attr_text}

# CRITICAL INSTRUCTIONS:
# - Return ONLY valid JSON, nothing else
# - No explanations, no markdown, no text before or after the JSON
# - For mandatory attributes, choose EXACTLY ONE value from the provided list that best matches
# - If a mandatory attribute cannot be determined from the product text, use "Not Specified"
# - Work with whatever information is available - the product text may be incomplete (only title, or only description, etc.)
# {f"- For additional attributes, extract any relevant information found in the product text" if extract_additional else ""}
# - Be precise and only extract information that is explicitly stated or clearly implied

# Required Output Format (ONLY THIS, NO OTHER TEXT):
# {json.dumps(output_format, indent=2)}
#         """

#         payload = {
#             "model": model,
#             "messages": [
#                 {
#                     "role": "system",
#                     "content": f"You are a precise attribute extraction model. Return ONLY valid JSON with {'mandatory and additional' if extract_additional else 'mandatory'} sections. No explanations, no markdown, no other text."
#                 },
#                 {"role": "user", "content": prompt}
#             ],
#             "temperature": 0.0,
#             "max_tokens": 1500
#         }

#         headers = {
#             "Authorization": f"Bearer {settings.GROQ_API_KEY}",
#             "Content-Type": "application/json",
#         }

#         try:
#             response = requests.post(
#                 settings.GROQ_API_URL,
#                 headers=headers,
#                 json=payload,
#                 timeout=30
#             )
#             response.raise_for_status()
#             result_text = response.json()["choices"][0]["message"]["content"].strip()

#             # Clean the response
#             result_text = ProductAttributeService._clean_json_response(result_text)

#             # Parse JSON
#             parsed = json.loads(result_text)

#             # Validate and restructure if needed
#             parsed = ProductAttributeService._validate_response_structure(
#                 parsed, mandatory_attrs, extract_additional
#             )

#             return parsed

#         except requests.exceptions.RequestException as e:
#             return ProductAttributeService._create_error_response(
#                 str(e), mandatory_attrs, extract_additional
#             )
#         except json.JSONDecodeError as e:
#             return ProductAttributeService._create_error_response(
#                 f"Invalid JSON: {str(e)}", mandatory_attrs, extract_additional, result_text
#             )
#         except Exception as e:
#             return ProductAttributeService._create_error_response(
#                 str(e), mandatory_attrs, extract_additional
#             )

#     @staticmethod
#     def extract_attributes_batch(
#         products: List[Dict],
#         mandatory_attrs: Dict[str, List[str]],
#         model: str = None,
#         extract_additional: bool = True,
#         max_workers: int = 5
#     ) -> Dict:
#         """
#         Extract attributes for multiple products in parallel.
        
#         Args:
#             products: List of product dictionaries with keys: product_id, title, short_desc, long_desc
#             mandatory_attrs: Dictionary of mandatory attributes
#             model: Groq model to use
#             extract_additional: Whether to extract additional attributes
#             max_workers: Maximum number of parallel workers
            
#         Returns:
#             Dictionary with results, total_products, successful, and failed counts
#         """
#         results = []
#         successful = 0
#         failed = 0

#         def process_product(product_data):
#             """Process a single product."""
#             product_id = product_data.get('product_id', f"product_{len(results)}")
            
#             try:
#                 product_text = ProductAttributeService.combine_product_text(
#                     title=product_data.get('title'),
#                     short_desc=product_data.get('short_desc'),
#                     long_desc=product_data.get('long_desc')
#                 )
                
#                 result = ProductAttributeService.extract_attributes(
#                     product_text=product_text,
#                     mandatory_attrs=mandatory_attrs,
#                     model=model,
#                     extract_additional=extract_additional
#                 )
                
#                 result['product_id'] = product_id
                
#                 # Check if extraction was successful
#                 if 'error' not in result:
#                     return result, True
#                 else:
#                     return result, False
                    
#             except Exception as e:
#                 return {
#                     'product_id': product_id,
#                     'mandatory': {attr: "Not Specified" for attr in mandatory_attrs.keys()},
#                     'additional': {} if extract_additional else None,
#                     'error': f"Processing error: {str(e)}"
#                 }, False

#         # Process products in parallel
#         with ThreadPoolExecutor(max_workers=max_workers) as executor:
#             future_to_product = {
#                 executor.submit(process_product, product): product 
#                 for product in products
#             }
            
#             for future in as_completed(future_to_product):
#                 try:
#                     result, success = future.result()
#                     results.append(result)
#                     if success:
#                         successful += 1
#                     else:
#                         failed += 1
#                 except Exception as e:
#                     failed += 1
#                     results.append({
#                         'product_id': 'unknown',
#                         'mandatory': {attr: "Not Specified" for attr in mandatory_attrs.keys()},
#                         'additional': {} if extract_additional else None,
#                         'error': f"Unexpected error: {str(e)}"
#                     })

#         return {
#             'results': results,
#             'total_products': len(products),
#             'successful': successful,
#             'failed': failed
#         }

#     @staticmethod
#     def _clean_json_response(text: str) -> str:
#         """Clean LLM response to extract valid JSON."""
#         start_idx = text.find('{')
#         end_idx = text.rfind('}')

#         if start_idx != -1 and end_idx != -1:
#             text = text[start_idx:end_idx + 1]

#         if "```json" in text:
#             text = text.split("```json")[1].split("```")[0].strip()
#         elif "```" in text:
#             text = text.split("```")[1].split("```")[0].strip()
#             if text.startswith("json"):
#                 text = text[4:].strip()

#         return text

#     @staticmethod
#     def _validate_response_structure(
#         parsed: dict,
#         mandatory_attrs: Dict[str, List[str]],
#         extract_additional: bool
#     ) -> dict:
#         """Validate and fix the response structure."""
#         expected_sections = ["mandatory"]
#         if extract_additional:
#             expected_sections.append("additional")

#         if not all(section in parsed for section in expected_sections):
#             if isinstance(parsed, dict):
#                 mandatory_keys = set(mandatory_attrs.keys())
#                 mandatory = {k: v for k, v in parsed.items() if k in mandatory_keys}
#                 additional = {k: v for k, v in parsed.items() if k not in mandatory_keys}

#                 result = {"mandatory": mandatory}
#                 if extract_additional:
#                     result["additional"] = additional
#                 return result
#             else:
#                 return ProductAttributeService._create_error_response(
#                     "Invalid response structure",
#                     mandatory_attrs,
#                     extract_additional,
#                     str(parsed)
#                 )

#         return parsed

#     @staticmethod
#     def _create_error_response(
#         error: str,
#         mandatory_attrs: Dict[str, List[str]],
#         extract_additional: bool,
#         raw_output: Optional[str] = None
#     ) -> dict:
#         """Create a standardized error response."""
#         response = {
#             "mandatory": {attr: "Not Specified" for attr in mandatory_attrs.keys()},
#             "error": error
#         }
#         if extract_additional:
#             response["additional"] = {}
#         if raw_output:
#             response["raw_output"] = raw_output
#         return response


# ==================== services.py ====================
import requests
import json
from typing import Dict, List, Optional
from django.conf import settings
from concurrent.futures import ThreadPoolExecutor, as_completed
from .ocr_service import OCRService


class ProductAttributeService:
    """Service class for extracting product attributes using Groq LLM."""

    @staticmethod
    def combine_product_text(
        title: Optional[str] = None,
        short_desc: Optional[str] = None,
        long_desc: Optional[str] = None,
        ocr_text: Optional[str] = None
    ) -> str:
        """Combine product metadata into a single text block."""
        parts = []
        if title:
            parts.append(f"Title: {str(title).strip()}")
        if short_desc:
            parts.append(f"Description: {str(short_desc).strip()}")
        if long_desc:
            parts.append(f"Details: {str(long_desc).strip()}")
        if ocr_text:
            parts.append(f"OCR Text: {ocr_text}")
        
        combined = "\n".join(parts).strip()
        
        if not combined:
            return "No product information available"
        
        return combined

    @staticmethod
    def extract_attributes_from_ocr(ocr_results: Dict, model: str = None) -> Dict:
        """Extract structured attributes from OCR text using LLM."""
        if model is None:
            model = settings.SUPPORTED_MODELS[0]
        
        detected_text = ocr_results.get('detected_text', [])
        if not detected_text:
            return {}
        
        # Format OCR text for prompt
        ocr_text = "\n".join([f"Text: {item['text']}, Confidence: {item['confidence']:.2f}" 
                              for item in detected_text])
        
        prompt = f"""
You are an AI model that extracts structured attributes from OCR text detected on product images.
Given the OCR detections below, infer the possible product attributes and return them as a clean JSON object.

OCR Text:
{ocr_text}

Extract relevant attributes like:
- brand
- model_number
- size (waist_size, length, etc.)
- collection
- any other relevant product information

Return a JSON object with only the attributes you can confidently identify.
If an attribute is not present, do not include it in the response.
"""
        
        payload = {
            "model": model,
            "messages": [
                {
                    "role": "system",
                    "content": "You are a helpful AI that extracts structured data from OCR output. Return only valid JSON."
                },
                {"role": "user", "content": prompt}
            ],
            "temperature": 0.2,
            "max_tokens": 500
        }
        
        headers = {
            "Authorization": f"Bearer {settings.GROQ_API_KEY}",
            "Content-Type": "application/json",
        }
        
        try:
            response = requests.post(
                settings.GROQ_API_URL,
                headers=headers,
                json=payload,
                timeout=30
            )
            response.raise_for_status()
            result_text = response.json()["choices"][0]["message"]["content"].strip()
            
            # Clean and parse JSON
            result_text = ProductAttributeService._clean_json_response(result_text)
            parsed = json.loads(result_text)
            
            return parsed
        except Exception as e:
            return {"error": f"Failed to extract attributes from OCR: {str(e)}"}

    @staticmethod
    def extract_attributes(
        product_text: str,
        mandatory_attrs: Dict[str, List[str]],
        model: str = None,
        extract_additional: bool = True
    ) -> dict:
        """Use Groq LLM to extract attributes from any product type."""
        
        if model is None:
            model = settings.SUPPORTED_MODELS[0]

        # Check if product text is empty or minimal
        if not product_text or product_text == "No product information available":
            return ProductAttributeService._create_error_response(
                "No product information provided",
                mandatory_attrs,
                extract_additional
            )

        # Create structured prompt for mandatory attributes
        mandatory_attr_list = []
        for attr_name, allowed_values in mandatory_attrs.items():
            mandatory_attr_list.append(f"{attr_name}: {', '.join(allowed_values)}")
        mandatory_attr_text = "\n".join(mandatory_attr_list)

        additional_instruction = ""
        if extract_additional:
            additional_instruction = """
2. Extract ADDITIONAL attributes: Identify any other relevant attributes from the product text 
   (such as Material, Size, Color, Brand, Dimensions, Weight, Features, Specifications, etc.) 
   and their values. Extract attributes that are specific and relevant to this product type."""

        output_format = {
            "mandatory": {attr: "value" for attr in mandatory_attrs.keys()},
            "additional": {} if extract_additional else None
        }

        if not extract_additional:
            output_format.pop("additional")

        prompt = f"""
You are an intelligent product attribute extractor that works with ANY product type.

TASK:
1. Extract MANDATORY attributes: For each mandatory attribute, select the most appropriate value 
   from the provided list. Choose the value that best matches the product description.
{additional_instruction}

Product Text:
{product_text}

Mandatory Attribute Lists (MUST select one value for each):
{mandatory_attr_text}

CRITICAL INSTRUCTIONS:
- Return ONLY valid JSON, nothing else
- No explanations, no markdown, no text before or after the JSON
- For mandatory attributes, choose EXACTLY ONE value from the provided list that best matches
- If a mandatory attribute cannot be determined from the product text, use "Not Specified"
- Work with whatever information is available - the product text may be incomplete
{f"- For additional attributes, extract any relevant information found in the product text" if extract_additional else ""}
- Be precise and only extract information that is explicitly stated or clearly implied

Required Output Format (ONLY THIS, NO OTHER TEXT):
{json.dumps(output_format, indent=2)}
        """

        payload = {
            "model": model,
            "messages": [
                {
                    "role": "system",
                    "content": f"You are a precise attribute extraction model. Return ONLY valid JSON with {'mandatory and additional' if extract_additional else 'mandatory'} sections. No explanations, no markdown, no other text."
                },
                {"role": "user", "content": prompt}
            ],
            "temperature": 0.0,
            "max_tokens": 1500
        }

        headers = {
            "Authorization": f"Bearer {settings.GROQ_API_KEY}",
            "Content-Type": "application/json",
        }

        try:
            response = requests.post(
                settings.GROQ_API_URL,
                headers=headers,
                json=payload,
                timeout=30
            )
            response.raise_for_status()
            result_text = response.json()["choices"][0]["message"]["content"].strip()

            # Clean the response
            result_text = ProductAttributeService._clean_json_response(result_text)

            # Parse JSON
            parsed = json.loads(result_text)

            # Validate and restructure if needed
            parsed = ProductAttributeService._validate_response_structure(
                parsed, mandatory_attrs, extract_additional
            )

            return parsed

        except requests.exceptions.RequestException as e:
            return ProductAttributeService._create_error_response(
                str(e), mandatory_attrs, extract_additional
            )
        except json.JSONDecodeError as e:
            return ProductAttributeService._create_error_response(
                f"Invalid JSON: {str(e)}", mandatory_attrs, extract_additional, result_text
            )
        except Exception as e:
            return ProductAttributeService._create_error_response(
                str(e), mandatory_attrs, extract_additional
            )

    @staticmethod
    def extract_attributes_batch(
        products: List[Dict],
        mandatory_attrs: Dict[str, List[str]],
        model: str = None,
        extract_additional: bool = True,
        process_image: bool = True,
        max_workers: int = 5
    ) -> Dict:
        """Extract attributes for multiple products in parallel.

        Each product is handled on a thread pool: optional OCR on its
        ``image_url``, then attribute extraction on the combined title,
        descriptions and OCR text.

        Args:
            products: Product dicts. The keys read here are ``product_id``,
                ``image_url``, ``title``, ``short_desc`` and ``long_desc``.
            mandatory_attrs: Mapping of mandatory attribute name to its list
                of values; passed through to ``extract_attributes`` and used
                to build "Not Specified" placeholders on failure.
            model: Model identifier forwarded to the extraction calls.
            extract_additional: Whether an ``additional`` section is requested.
            process_image: When true, run OCR for products with an ``image_url``.
            max_workers: Size of the thread pool.

        Returns:
            A dict with ``results`` (one entry per product, in completion
            order rather than input order), ``total_products``, ``successful``
            and ``failed``. A result counts as failed when it carries an
            ``error`` key.
        """
        results = []
        successful = 0
        failed = 0

        # A single OCR service instance is shared by all worker threads.
        # NOTE(review): assumes OCRService.process_image tolerates concurrent
        # calls — confirm against its implementation.
        ocr_service = OCRService()

        def resolve_product_id(index, product_data):
            """Return the product's own id, or a stable positional fallback."""
            # The fallback is derived from the input position, not from
            # len(results): that list is mutated by the main thread while
            # workers run, which produced nondeterministic, duplicated ids.
            fallback_id = f"product_{index}"
            if isinstance(product_data, dict):
                return product_data.get('product_id', fallback_id)
            return fallback_id

        def failure_result(product_id, message):
            """Build the placeholder result returned when a product fails."""
            return {
                'product_id': product_id,
                'mandatory': {attr: "Not Specified" for attr in mandatory_attrs.keys()},
                'additional': {} if extract_additional else None,
                'error': message
            }

        def process_product(index, product_data):
            """Process a single product; return ``(result, success)``."""
            product_id = resolve_product_id(index, product_data)

            try:
                # Process image if URL is provided
                ocr_results = None
                ocr_text = None

                if process_image and product_data.get('image_url'):
                    ocr_results = ocr_service.process_image(product_data['image_url'])

                    # Extract attributes from OCR
                    if ocr_results and ocr_results.get('detected_text'):
                        ocr_attrs = ProductAttributeService.extract_attributes_from_ocr(
                            ocr_results, model
                        )
                        ocr_results['extracted_attributes'] = ocr_attrs

                        # Format OCR text for combining with product text
                        ocr_text = "\n".join(
                            f"{item['text']} (confidence: {item['confidence']:.2f})"
                            for item in ocr_results['detected_text']
                        )

                # Combine all product information
                product_text = ProductAttributeService.combine_product_text(
                    title=product_data.get('title'),
                    short_desc=product_data.get('short_desc'),
                    long_desc=product_data.get('long_desc'),
                    ocr_text=ocr_text
                )

                # Extract attributes from combined text
                result = ProductAttributeService.extract_attributes(
                    product_text=product_text,
                    mandatory_attrs=mandatory_attrs,
                    model=model,
                    extract_additional=extract_additional
                )

                result['product_id'] = product_id

                # Add OCR results if available
                if ocr_results:
                    result['ocr_results'] = ocr_results

                # Extraction succeeded only if no error was reported
                return result, 'error' not in result

            except Exception as e:
                return failure_result(product_id, f"Processing error: {str(e)}"), False

        # Process products in parallel
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            future_to_product = {
                executor.submit(process_product, index, product): (index, product)
                for index, product in enumerate(products)
            }

            for future in as_completed(future_to_product):
                try:
                    result, success = future.result()
                except Exception as e:
                    # Keep the failure attributable to its product instead of
                    # reporting an anonymous 'unknown' entry.
                    index, product = future_to_product[future]
                    result = failure_result(
                        resolve_product_id(index, product),
                        f"Unexpected error: {str(e)}"
                    )
                    success = False

                results.append(result)
                if success:
                    successful += 1
                else:
                    failed += 1

        return {
            'results': results,
            'total_products': len(products),
            'successful': successful,
            'failed': failed
        }

    @staticmethod
    def _clean_json_response(text: str) -> str:
        """Clean LLM response to extract valid JSON."""
        start_idx = text.find('{')
        end_idx = text.rfind('}')

        if start_idx != -1 and end_idx != -1:
            text = text[start_idx:end_idx + 1]

        if "```json" in text:
            text = text.split("```json")[1].split("```")[0].strip()
        elif "```" in text:
            text = text.split("```")[1].split("```")[0].strip()
            if text.startswith("json"):
                text = text[4:].strip()

        return text

    @staticmethod
    def _validate_response_structure(
        parsed: dict,
        mandatory_attrs: Dict[str, List[str]],
        extract_additional: bool
    ) -> dict:
        """Validate and fix the response structure."""
        expected_sections = ["mandatory"]
        if extract_additional:
            expected_sections.append("additional")

        if not all(section in parsed for section in expected_sections):
            if isinstance(parsed, dict):
                mandatory_keys = set(mandatory_attrs.keys())
                mandatory = {k: v for k, v in parsed.items() if k in mandatory_keys}
                additional = {k: v for k, v in parsed.items() if k not in mandatory_keys}

                result = {"mandatory": mandatory}
                if extract_additional:
                    result["additional"] = additional
                return result
            else:
                return ProductAttributeService._create_error_response(
                    "Invalid response structure",
                    mandatory_attrs,
                    extract_additional,
                    str(parsed)
                )

        return parsed

    @staticmethod
    def _create_error_response(
        error: str,
        mandatory_attrs: Dict[str, List[str]],
        extract_additional: bool,
        raw_output: Optional[str] = None
    ) -> dict:
        """Create a standardized error response."""
        response = {
            "mandatory": {attr: "Not Specified" for attr in mandatory_attrs.keys()},
            "error": error
        }
        if extract_additional:
            response["additional"] = {}
        if raw_output:
            response["raw_output"] = raw_output
        return response