services.py 35 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942
  1. # # import requests
  2. # # import json
  3. # # from typing import Dict, List, Optional
  4. # # from django.conf import settings
  5. # # class ProductAttributeService:
  6. # # """Service class for extracting product attributes using Groq LLM."""
  7. # # @staticmethod
  8. # # def combine_product_text(
  9. # # title: Optional[str] = None,
  10. # # short_desc: Optional[str] = None,
  11. # # long_desc: Optional[str] = None
  12. # # ) -> str:
  13. # # """Combine product metadata into a single text block."""
  14. # # parts = []
  15. # # if title:
  16. # # parts.append(str(title).strip())
  17. # # if short_desc:
  18. # # parts.append(str(short_desc).strip())
  19. # # if long_desc:
  20. # # parts.append(str(long_desc).strip())
  21. # # combined = " ".join(parts).strip()
  22. # # if not combined:
  23. # # return "No product information available"
  24. # # return combined
  25. # # @staticmethod
  26. # # def extract_attributes(
  27. # # product_text: str,
  28. # # mandatory_attrs: Dict[str, List[str]],
  29. # # model: str = None,
  30. # # extract_additional: bool = True
  31. # # ) -> dict:
  32. # # """Use Groq LLM to extract attributes from any product type."""
  33. # # if model is None:
  34. # # model = settings.SUPPORTED_MODELS[0]
  35. # # # Check if product text is empty or minimal
  36. # # if not product_text or product_text == "No product information available":
  37. # # return ProductAttributeService._create_error_response(
  38. # # "No product information provided",
  39. # # mandatory_attrs,
  40. # # extract_additional
  41. # # )
  42. # # # Create structured prompt for mandatory attributes
  43. # # mandatory_attr_list = []
  44. # # for attr_name, allowed_values in mandatory_attrs.items():
  45. # # mandatory_attr_list.append(f"{attr_name}: {', '.join(allowed_values)}")
  46. # # mandatory_attr_text = "\n".join(mandatory_attr_list)
  47. # # additional_instruction = ""
  48. # # if extract_additional:
  49. # # additional_instruction = """
  50. # # 2. Extract ADDITIONAL attributes: Identify any other relevant attributes from the product text
  51. # # (such as Material, Size, Color, Brand, Dimensions, Weight, Features, Specifications, etc.)
  52. # # and their values. Extract attributes that are specific and relevant to this product type."""
  53. # # output_format = {
  54. # # "mandatory": {attr: "value" for attr in mandatory_attrs.keys()},
  55. # # "additional": {} if extract_additional else None
  56. # # }
  57. # # if not extract_additional:
  58. # # output_format.pop("additional")
  59. # # prompt = f"""
  60. # # You are an intelligent product attribute extractor that works with ANY product type.
  61. # # TASK:
  62. # # 1. Extract MANDATORY attributes: For each mandatory attribute, select the most appropriate value
  63. # # from the provided list. Choose the value that best matches the product description.
  64. # # {additional_instruction}
  65. # # Product Text:
  66. # # {product_text}
  67. # # Mandatory Attribute Lists (MUST select one value for each):
  68. # # {mandatory_attr_text}
  69. # # CRITICAL INSTRUCTIONS:
  70. # # - Return ONLY valid JSON, nothing else
  71. # # - No explanations, no markdown, no text before or after the JSON
  72. # # - For mandatory attributes, choose EXACTLY ONE value from the provided list that best matches
  73. # # - If a mandatory attribute cannot be determined from the product text, use "Not Specified"
  74. # # - Work with whatever information is available - the product text may be incomplete (only title, or only description, etc.)
  75. # # {f"- For additional attributes, extract any relevant information found in the product text" if extract_additional else ""}
  76. # # - Be precise and only extract information that is explicitly stated or clearly implied
  77. # # Required Output Format (ONLY THIS, NO OTHER TEXT):
  78. # # {json.dumps(output_format, indent=2)}
  79. # # """
  80. # # payload = {
  81. # # "model": model,
  82. # # "messages": [
  83. # # {
  84. # # "role": "system",
  85. # # "content": f"You are a precise attribute extraction model. Return ONLY valid JSON with {'mandatory and additional' if extract_additional else 'mandatory'} sections. No explanations, no markdown, no other text."
  86. # # },
  87. # # {"role": "user", "content": prompt}
  88. # # ],
  89. # # "temperature": 0.0,
  90. # # "max_tokens": 1500
  91. # # }
  92. # # headers = {
  93. # # "Authorization": f"Bearer {settings.GROQ_API_KEY}",
  94. # # "Content-Type": "application/json",
  95. # # }
  96. # # try:
  97. # # response = requests.post(
  98. # # settings.GROQ_API_URL,
  99. # # headers=headers,
  100. # # json=payload,
  101. # # timeout=30
  102. # # )
  103. # # response.raise_for_status()
  104. # # result_text = response.json()["choices"][0]["message"]["content"].strip()
  105. # # # Clean the response
  106. # # result_text = ProductAttributeService._clean_json_response(result_text)
  107. # # # Parse JSON
  108. # # parsed = json.loads(result_text)
  109. # # # Validate and restructure if needed
  110. # # parsed = ProductAttributeService._validate_response_structure(
  111. # # parsed, mandatory_attrs, extract_additional
  112. # # )
  113. # # return parsed
  114. # # except requests.exceptions.RequestException as e:
  115. # # return ProductAttributeService._create_error_response(
  116. # # str(e), mandatory_attrs, extract_additional
  117. # # )
  118. # # except json.JSONDecodeError as e:
  119. # # return ProductAttributeService._create_error_response(
  120. # # f"Invalid JSON: {str(e)}", mandatory_attrs, extract_additional, result_text
  121. # # )
  122. # # except Exception as e:
  123. # # return ProductAttributeService._create_error_response(
  124. # # str(e), mandatory_attrs, extract_additional
  125. # # )
  126. # # @staticmethod
  127. # # def _clean_json_response(text: str) -> str:
  128. # # """Clean LLM response to extract valid JSON."""
  129. # # start_idx = text.find('{')
  130. # # end_idx = text.rfind('}')
  131. # # if start_idx != -1 and end_idx != -1:
  132. # # text = text[start_idx:end_idx + 1]
  133. # # if "```json" in text:
  134. # # text = text.split("```json")[1].split("```")[0].strip()
  135. # # elif "```" in text:
  136. # # text = text.split("```")[1].split("```")[0].strip()
  137. # # if text.startswith("json"):
  138. # # text = text[4:].strip()
  139. # # return text
  140. # # @staticmethod
  141. # # def _validate_response_structure(
  142. # # parsed: dict,
  143. # # mandatory_attrs: Dict[str, List[str]],
  144. # # extract_additional: bool
  145. # # ) -> dict:
  146. # # """Validate and fix the response structure."""
  147. # # expected_sections = ["mandatory"]
  148. # # if extract_additional:
  149. # # expected_sections.append("additional")
  150. # # if not all(section in parsed for section in expected_sections):
  151. # # if isinstance(parsed, dict):
  152. # # mandatory_keys = set(mandatory_attrs.keys())
  153. # # mandatory = {k: v for k, v in parsed.items() if k in mandatory_keys}
  154. # # additional = {k: v for k, v in parsed.items() if k not in mandatory_keys}
  155. # # result = {"mandatory": mandatory}
  156. # # if extract_additional:
  157. # # result["additional"] = additional
  158. # # return result
  159. # # else:
  160. # # return ProductAttributeService._create_error_response(
  161. # # "Invalid response structure",
  162. # # mandatory_attrs,
  163. # # extract_additional,
  164. # # str(parsed)
  165. # # )
  166. # # return parsed
  167. # # @staticmethod
  168. # # def _create_error_response(
  169. # # error: str,
  170. # # mandatory_attrs: Dict[str, List[str]],
  171. # # extract_additional: bool,
  172. # # raw_output: Optional[str] = None
  173. # # ) -> dict:
  174. # # """Create a standardized error response."""
  175. # # response = {
  176. # # "mandatory": {attr: "Not Specified" for attr in mandatory_attrs.keys()},
  177. # # "error": error
  178. # # }
  179. # # if extract_additional:
  180. # # response["additional"] = {}
  181. # # if raw_output:
  182. # # response["raw_output"] = raw_output
  183. # # return response
  184. # import requests
  185. # import json
  186. # from typing import Dict, List, Optional
  187. # from django.conf import settings
  188. # from concurrent.futures import ThreadPoolExecutor, as_completed
  189. # class ProductAttributeService:
  190. # """Service class for extracting product attributes using Groq LLM."""
  191. # @staticmethod
  192. # def combine_product_text(
  193. # title: Optional[str] = None,
  194. # short_desc: Optional[str] = None,
  195. # long_desc: Optional[str] = None
  196. # ) -> str:
  197. # """Combine product metadata into a single text block."""
  198. # parts = []
  199. # if title:
  200. # parts.append(str(title).strip())
  201. # if short_desc:
  202. # parts.append(str(short_desc).strip())
  203. # if long_desc:
  204. # parts.append(str(long_desc).strip())
  205. # combined = " ".join(parts).strip()
  206. # if not combined:
  207. # return "No product information available"
  208. # return combined
  209. # @staticmethod
  210. # def extract_attributes(
  211. # product_text: str,
  212. # mandatory_attrs: Dict[str, List[str]],
  213. # model: str = None,
  214. # extract_additional: bool = True
  215. # ) -> dict:
  216. # """Use Groq LLM to extract attributes from any product type."""
  217. # if model is None:
  218. # model = settings.SUPPORTED_MODELS[0]
  219. # # Check if product text is empty or minimal
  220. # if not product_text or product_text == "No product information available":
  221. # return ProductAttributeService._create_error_response(
  222. # "No product information provided",
  223. # mandatory_attrs,
  224. # extract_additional
  225. # )
  226. # # Create structured prompt for mandatory attributes
  227. # mandatory_attr_list = []
  228. # for attr_name, allowed_values in mandatory_attrs.items():
  229. # mandatory_attr_list.append(f"{attr_name}: {', '.join(allowed_values)}")
  230. # mandatory_attr_text = "\n".join(mandatory_attr_list)
  231. # additional_instruction = ""
  232. # if extract_additional:
  233. # additional_instruction = """
  234. # 2. Extract ADDITIONAL attributes: Identify any other relevant attributes from the product text
  235. # (such as Material, Size, Color, Brand, Dimensions, Weight, Features, Specifications, etc.)
  236. # and their values. Extract attributes that are specific and relevant to this product type."""
  237. # output_format = {
  238. # "mandatory": {attr: "value" for attr in mandatory_attrs.keys()},
  239. # "additional": {} if extract_additional else None
  240. # }
  241. # if not extract_additional:
  242. # output_format.pop("additional")
  243. # prompt = f"""
  244. # You are an intelligent product attribute extractor that works with ANY product type.
  245. # TASK:
  246. # 1. Extract MANDATORY attributes: For each mandatory attribute, select the most appropriate value
  247. # from the provided list. Choose the value that best matches the product description.
  248. # {additional_instruction}
  249. # Product Text:
  250. # {product_text}
  251. # Mandatory Attribute Lists (MUST select one value for each):
  252. # {mandatory_attr_text}
  253. # CRITICAL INSTRUCTIONS:
  254. # - Return ONLY valid JSON, nothing else
  255. # - No explanations, no markdown, no text before or after the JSON
  256. # - For mandatory attributes, choose EXACTLY ONE value from the provided list that best matches
  257. # - If a mandatory attribute cannot be determined from the product text, use "Not Specified"
  258. # - Work with whatever information is available - the product text may be incomplete (only title, or only description, etc.)
  259. # {f"- For additional attributes, extract any relevant information found in the product text" if extract_additional else ""}
  260. # - Be precise and only extract information that is explicitly stated or clearly implied
  261. # Required Output Format (ONLY THIS, NO OTHER TEXT):
  262. # {json.dumps(output_format, indent=2)}
  263. # """
  264. # payload = {
  265. # "model": model,
  266. # "messages": [
  267. # {
  268. # "role": "system",
  269. # "content": f"You are a precise attribute extraction model. Return ONLY valid JSON with {'mandatory and additional' if extract_additional else 'mandatory'} sections. No explanations, no markdown, no other text."
  270. # },
  271. # {"role": "user", "content": prompt}
  272. # ],
  273. # "temperature": 0.0,
  274. # "max_tokens": 1500
  275. # }
  276. # headers = {
  277. # "Authorization": f"Bearer {settings.GROQ_API_KEY}",
  278. # "Content-Type": "application/json",
  279. # }
  280. # try:
  281. # response = requests.post(
  282. # settings.GROQ_API_URL,
  283. # headers=headers,
  284. # json=payload,
  285. # timeout=30
  286. # )
  287. # response.raise_for_status()
  288. # result_text = response.json()["choices"][0]["message"]["content"].strip()
  289. # # Clean the response
  290. # result_text = ProductAttributeService._clean_json_response(result_text)
  291. # # Parse JSON
  292. # parsed = json.loads(result_text)
  293. # # Validate and restructure if needed
  294. # parsed = ProductAttributeService._validate_response_structure(
  295. # parsed, mandatory_attrs, extract_additional
  296. # )
  297. # return parsed
  298. # except requests.exceptions.RequestException as e:
  299. # return ProductAttributeService._create_error_response(
  300. # str(e), mandatory_attrs, extract_additional
  301. # )
  302. # except json.JSONDecodeError as e:
  303. # return ProductAttributeService._create_error_response(
  304. # f"Invalid JSON: {str(e)}", mandatory_attrs, extract_additional, result_text
  305. # )
  306. # except Exception as e:
  307. # return ProductAttributeService._create_error_response(
  308. # str(e), mandatory_attrs, extract_additional
  309. # )
  310. # @staticmethod
  311. # def extract_attributes_batch(
  312. # products: List[Dict],
  313. # mandatory_attrs: Dict[str, List[str]],
  314. # model: str = None,
  315. # extract_additional: bool = True,
  316. # max_workers: int = 5
  317. # ) -> Dict:
  318. # """
  319. # Extract attributes for multiple products in parallel.
  320. # Args:
  321. # products: List of product dictionaries with keys: product_id, title, short_desc, long_desc
  322. # mandatory_attrs: Dictionary of mandatory attributes
  323. # model: Groq model to use
  324. # extract_additional: Whether to extract additional attributes
  325. # max_workers: Maximum number of parallel workers
  326. # Returns:
  327. # Dictionary with results, total_products, successful, and failed counts
  328. # """
  329. # results = []
  330. # successful = 0
  331. # failed = 0
  332. # def process_product(product_data):
  333. # """Process a single product."""
  334. # product_id = product_data.get('product_id', f"product_{len(results)}")
  335. # try:
  336. # product_text = ProductAttributeService.combine_product_text(
  337. # title=product_data.get('title'),
  338. # short_desc=product_data.get('short_desc'),
  339. # long_desc=product_data.get('long_desc')
  340. # )
  341. # result = ProductAttributeService.extract_attributes(
  342. # product_text=product_text,
  343. # mandatory_attrs=mandatory_attrs,
  344. # model=model,
  345. # extract_additional=extract_additional
  346. # )
  347. # result['product_id'] = product_id
  348. # # Check if extraction was successful
  349. # if 'error' not in result:
  350. # return result, True
  351. # else:
  352. # return result, False
  353. # except Exception as e:
  354. # return {
  355. # 'product_id': product_id,
  356. # 'mandatory': {attr: "Not Specified" for attr in mandatory_attrs.keys()},
  357. # 'additional': {} if extract_additional else None,
  358. # 'error': f"Processing error: {str(e)}"
  359. # }, False
  360. # # Process products in parallel
  361. # with ThreadPoolExecutor(max_workers=max_workers) as executor:
  362. # future_to_product = {
  363. # executor.submit(process_product, product): product
  364. # for product in products
  365. # }
  366. # for future in as_completed(future_to_product):
  367. # try:
  368. # result, success = future.result()
  369. # results.append(result)
  370. # if success:
  371. # successful += 1
  372. # else:
  373. # failed += 1
  374. # except Exception as e:
  375. # failed += 1
  376. # results.append({
  377. # 'product_id': 'unknown',
  378. # 'mandatory': {attr: "Not Specified" for attr in mandatory_attrs.keys()},
  379. # 'additional': {} if extract_additional else None,
  380. # 'error': f"Unexpected error: {str(e)}"
  381. # })
  382. # return {
  383. # 'results': results,
  384. # 'total_products': len(products),
  385. # 'successful': successful,
  386. # 'failed': failed
  387. # }
  388. # @staticmethod
  389. # def _clean_json_response(text: str) -> str:
  390. # """Clean LLM response to extract valid JSON."""
  391. # start_idx = text.find('{')
  392. # end_idx = text.rfind('}')
  393. # if start_idx != -1 and end_idx != -1:
  394. # text = text[start_idx:end_idx + 1]
  395. # if "```json" in text:
  396. # text = text.split("```json")[1].split("```")[0].strip()
  397. # elif "```" in text:
  398. # text = text.split("```")[1].split("```")[0].strip()
  399. # if text.startswith("json"):
  400. # text = text[4:].strip()
  401. # return text
  402. # @staticmethod
  403. # def _validate_response_structure(
  404. # parsed: dict,
  405. # mandatory_attrs: Dict[str, List[str]],
  406. # extract_additional: bool
  407. # ) -> dict:
  408. # """Validate and fix the response structure."""
  409. # expected_sections = ["mandatory"]
  410. # if extract_additional:
  411. # expected_sections.append("additional")
  412. # if not all(section in parsed for section in expected_sections):
  413. # if isinstance(parsed, dict):
  414. # mandatory_keys = set(mandatory_attrs.keys())
  415. # mandatory = {k: v for k, v in parsed.items() if k in mandatory_keys}
  416. # additional = {k: v for k, v in parsed.items() if k not in mandatory_keys}
  417. # result = {"mandatory": mandatory}
  418. # if extract_additional:
  419. # result["additional"] = additional
  420. # return result
  421. # else:
  422. # return ProductAttributeService._create_error_response(
  423. # "Invalid response structure",
  424. # mandatory_attrs,
  425. # extract_additional,
  426. # str(parsed)
  427. # )
  428. # return parsed
  429. # @staticmethod
  430. # def _create_error_response(
  431. # error: str,
  432. # mandatory_attrs: Dict[str, List[str]],
  433. # extract_additional: bool,
  434. # raw_output: Optional[str] = None
  435. # ) -> dict:
  436. # """Create a standardized error response."""
  437. # response = {
  438. # "mandatory": {attr: "Not Specified" for attr in mandatory_attrs.keys()},
  439. # "error": error
  440. # }
  441. # if extract_additional:
  442. # response["additional"] = {}
  443. # if raw_output:
  444. # response["raw_output"] = raw_output
  445. # return response
  446. # ==================== services.py ====================
import json
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import Any, Dict, List, Optional

import requests
from django.conf import settings

from .ocr_service import OCRService
  453. class ProductAttributeService:
  454. """Service class for extracting product attributes using Groq LLM."""
  455. @staticmethod
  456. def combine_product_text(
  457. title: Optional[str] = None,
  458. short_desc: Optional[str] = None,
  459. long_desc: Optional[str] = None,
  460. ocr_text: Optional[str] = None
  461. ) -> str:
  462. """Combine product metadata into a single text block."""
  463. parts = []
  464. if title:
  465. parts.append(f"Title: {str(title).strip()}")
  466. if short_desc:
  467. parts.append(f"Description: {str(short_desc).strip()}")
  468. if long_desc:
  469. parts.append(f"Details: {str(long_desc).strip()}")
  470. if ocr_text:
  471. parts.append(f"OCR Text: {ocr_text}")
  472. combined = "\n".join(parts).strip()
  473. if not combined:
  474. return "No product information available"
  475. return combined
  476. @staticmethod
  477. def extract_attributes_from_ocr(ocr_results: Dict, model: str = None) -> Dict:
  478. """Extract structured attributes from OCR text using LLM."""
  479. if model is None:
  480. model = settings.SUPPORTED_MODELS[0]
  481. detected_text = ocr_results.get('detected_text', [])
  482. if not detected_text:
  483. return {}
  484. # Format OCR text for prompt
  485. ocr_text = "\n".join([f"Text: {item['text']}, Confidence: {item['confidence']:.2f}"
  486. for item in detected_text])
  487. prompt = f"""
  488. You are an AI model that extracts structured attributes from OCR text detected on product images.
  489. Given the OCR detections below, infer the possible product attributes and return them as a clean JSON object.
  490. OCR Text:
  491. {ocr_text}
  492. Extract relevant attributes like:
  493. - brand
  494. - model_number
  495. - size (waist_size, length, etc.)
  496. - collection
  497. - any other relevant product information
  498. Return a JSON object with only the attributes you can confidently identify.
  499. If an attribute is not present, do not include it in the response.
  500. """
  501. payload = {
  502. "model": model,
  503. "messages": [
  504. {
  505. "role": "system",
  506. "content": "You are a helpful AI that extracts structured data from OCR output. Return only valid JSON."
  507. },
  508. {"role": "user", "content": prompt}
  509. ],
  510. "temperature": 0.2,
  511. "max_tokens": 500
  512. }
  513. headers = {
  514. "Authorization": f"Bearer {settings.GROQ_API_KEY}",
  515. "Content-Type": "application/json",
  516. }
  517. try:
  518. response = requests.post(
  519. settings.GROQ_API_URL,
  520. headers=headers,
  521. json=payload,
  522. timeout=30
  523. )
  524. response.raise_for_status()
  525. result_text = response.json()["choices"][0]["message"]["content"].strip()
  526. # Clean and parse JSON
  527. result_text = ProductAttributeService._clean_json_response(result_text)
  528. parsed = json.loads(result_text)
  529. return parsed
  530. except Exception as e:
  531. return {"error": f"Failed to extract attributes from OCR: {str(e)}"}
  532. @staticmethod
  533. def extract_attributes(
  534. product_text: str,
  535. mandatory_attrs: Dict[str, List[str]],
  536. model: str = None,
  537. extract_additional: bool = True
  538. ) -> dict:
  539. """Use Groq LLM to extract attributes from any product type."""
  540. if model is None:
  541. model = settings.SUPPORTED_MODELS[0]
  542. # Check if product text is empty or minimal
  543. if not product_text or product_text == "No product information available":
  544. return ProductAttributeService._create_error_response(
  545. "No product information provided",
  546. mandatory_attrs,
  547. extract_additional
  548. )
  549. # Create structured prompt for mandatory attributes
  550. mandatory_attr_list = []
  551. for attr_name, allowed_values in mandatory_attrs.items():
  552. mandatory_attr_list.append(f"{attr_name}: {', '.join(allowed_values)}")
  553. mandatory_attr_text = "\n".join(mandatory_attr_list)
  554. additional_instruction = ""
  555. if extract_additional:
  556. additional_instruction = """
  557. 2. Extract ADDITIONAL attributes: Identify any other relevant attributes from the product text
  558. (such as Material, Size, Color, Brand, Dimensions, Weight, Features, Specifications, etc.)
  559. and their values. Extract attributes that are specific and relevant to this product type."""
  560. output_format = {
  561. "mandatory": {attr: "value" for attr in mandatory_attrs.keys()},
  562. "additional": {} if extract_additional else None
  563. }
  564. if not extract_additional:
  565. output_format.pop("additional")
  566. prompt = f"""
  567. You are an intelligent product attribute extractor that works with ANY product type.
  568. TASK:
  569. 1. Extract MANDATORY attributes: For each mandatory attribute, select the most appropriate value
  570. from the provided list. Choose the value that best matches the product description.
  571. {additional_instruction}
  572. Product Text:
  573. {product_text}
  574. Mandatory Attribute Lists (MUST select one value for each):
  575. {mandatory_attr_text}
  576. CRITICAL INSTRUCTIONS:
  577. - Return ONLY valid JSON, nothing else
  578. - No explanations, no markdown, no text before or after the JSON
  579. - For mandatory attributes, choose EXACTLY ONE value from the provided list that best matches
  580. - If a mandatory attribute cannot be determined from the product text, use "Not Specified"
  581. - Work with whatever information is available - the product text may be incomplete
  582. {f"- For additional attributes, extract any relevant information found in the product text" if extract_additional else ""}
  583. - Be precise and only extract information that is explicitly stated or clearly implied
  584. Required Output Format (ONLY THIS, NO OTHER TEXT):
  585. {json.dumps(output_format, indent=2)}
  586. """
  587. payload = {
  588. "model": model,
  589. "messages": [
  590. {
  591. "role": "system",
  592. "content": f"You are a precise attribute extraction model. Return ONLY valid JSON with {'mandatory and additional' if extract_additional else 'mandatory'} sections. No explanations, no markdown, no other text."
  593. },
  594. {"role": "user", "content": prompt}
  595. ],
  596. "temperature": 0.0,
  597. "max_tokens": 1500
  598. }
  599. headers = {
  600. "Authorization": f"Bearer {settings.GROQ_API_KEY}",
  601. "Content-Type": "application/json",
  602. }
  603. try:
  604. response = requests.post(
  605. settings.GROQ_API_URL,
  606. headers=headers,
  607. json=payload,
  608. timeout=30
  609. )
  610. response.raise_for_status()
  611. result_text = response.json()["choices"][0]["message"]["content"].strip()
  612. # Clean the response
  613. result_text = ProductAttributeService._clean_json_response(result_text)
  614. # Parse JSON
  615. parsed = json.loads(result_text)
  616. # Validate and restructure if needed
  617. parsed = ProductAttributeService._validate_response_structure(
  618. parsed, mandatory_attrs, extract_additional
  619. )
  620. return parsed
  621. except requests.exceptions.RequestException as e:
  622. return ProductAttributeService._create_error_response(
  623. str(e), mandatory_attrs, extract_additional
  624. )
  625. except json.JSONDecodeError as e:
  626. return ProductAttributeService._create_error_response(
  627. f"Invalid JSON: {str(e)}", mandatory_attrs, extract_additional, result_text
  628. )
  629. except Exception as e:
  630. return ProductAttributeService._create_error_response(
  631. str(e), mandatory_attrs, extract_additional
  632. )
  633. @staticmethod
  634. def extract_attributes_batch(
  635. products: List[Dict],
  636. mandatory_attrs: Dict[str, List[str]],
  637. model: str = None,
  638. extract_additional: bool = True,
  639. process_image: bool = True,
  640. max_workers: int = 5
  641. ) -> Dict:
  642. """Extract attributes for multiple products in parallel."""
  643. results = []
  644. successful = 0
  645. failed = 0
  646. ocr_service = OCRService()
  647. def process_product(product_data):
  648. """Process a single product."""
  649. product_id = product_data.get('product_id', f"product_{len(results)}")
  650. try:
  651. # Process image if URL is provided
  652. ocr_results = None
  653. ocr_text = None
  654. if process_image and product_data.get('image_url'):
  655. ocr_results = ocr_service.process_image(product_data['image_url'])
  656. # Extract attributes from OCR
  657. if ocr_results and ocr_results.get('detected_text'):
  658. ocr_attrs = ProductAttributeService.extract_attributes_from_ocr(
  659. ocr_results, model
  660. )
  661. ocr_results['extracted_attributes'] = ocr_attrs
  662. # Format OCR text for combining with product text
  663. ocr_text = "\n".join([
  664. f"{item['text']} (confidence: {item['confidence']:.2f})"
  665. for item in ocr_results['detected_text']
  666. ])
  667. # Combine all product information
  668. product_text = ProductAttributeService.combine_product_text(
  669. title=product_data.get('title'),
  670. short_desc=product_data.get('short_desc'),
  671. long_desc=product_data.get('long_desc'),
  672. ocr_text=ocr_text
  673. )
  674. # Extract attributes from combined text
  675. result = ProductAttributeService.extract_attributes(
  676. product_text=product_text,
  677. mandatory_attrs=mandatory_attrs,
  678. model=model,
  679. extract_additional=extract_additional
  680. )
  681. result['product_id'] = product_id
  682. # Add OCR results if available
  683. if ocr_results:
  684. result['ocr_results'] = ocr_results
  685. # Check if extraction was successful
  686. if 'error' not in result:
  687. return result, True
  688. else:
  689. return result, False
  690. except Exception as e:
  691. return {
  692. 'product_id': product_id,
  693. 'mandatory': {attr: "Not Specified" for attr in mandatory_attrs.keys()},
  694. 'additional': {} if extract_additional else None,
  695. 'error': f"Processing error: {str(e)}"
  696. }, False
  697. # Process products in parallel
  698. with ThreadPoolExecutor(max_workers=max_workers) as executor:
  699. future_to_product = {
  700. executor.submit(process_product, product): product
  701. for product in products
  702. }
  703. for future in as_completed(future_to_product):
  704. try:
  705. result, success = future.result()
  706. results.append(result)
  707. if success:
  708. successful += 1
  709. else:
  710. failed += 1
  711. except Exception as e:
  712. failed += 1
  713. results.append({
  714. 'product_id': 'unknown',
  715. 'mandatory': {attr: "Not Specified" for attr in mandatory_attrs.keys()},
  716. 'additional': {} if extract_additional else None,
  717. 'error': f"Unexpected error: {str(e)}"
  718. })
  719. return {
  720. 'results': results,
  721. 'total_products': len(products),
  722. 'successful': successful,
  723. 'failed': failed
  724. }
  725. @staticmethod
  726. def _clean_json_response(text: str) -> str:
  727. """Clean LLM response to extract valid JSON."""
  728. start_idx = text.find('{')
  729. end_idx = text.rfind('}')
  730. if start_idx != -1 and end_idx != -1:
  731. text = text[start_idx:end_idx + 1]
  732. if "```json" in text:
  733. text = text.split("```json")[1].split("```")[0].strip()
  734. elif "```" in text:
  735. text = text.split("```")[1].split("```")[0].strip()
  736. if text.startswith("json"):
  737. text = text[4:].strip()
  738. return text
  739. @staticmethod
  740. def _validate_response_structure(
  741. parsed: dict,
  742. mandatory_attrs: Dict[str, List[str]],
  743. extract_additional: bool
  744. ) -> dict:
  745. """Validate and fix the response structure."""
  746. expected_sections = ["mandatory"]
  747. if extract_additional:
  748. expected_sections.append("additional")
  749. if not all(section in parsed for section in expected_sections):
  750. if isinstance(parsed, dict):
  751. mandatory_keys = set(mandatory_attrs.keys())
  752. mandatory = {k: v for k, v in parsed.items() if k in mandatory_keys}
  753. additional = {k: v for k, v in parsed.items() if k not in mandatory_keys}
  754. result = {"mandatory": mandatory}
  755. if extract_additional:
  756. result["additional"] = additional
  757. return result
  758. else:
  759. return ProductAttributeService._create_error_response(
  760. "Invalid response structure",
  761. mandatory_attrs,
  762. extract_additional,
  763. str(parsed)
  764. )
  765. return parsed
  766. @staticmethod
  767. def _create_error_response(
  768. error: str,
  769. mandatory_attrs: Dict[str, List[str]],
  770. extract_additional: bool,
  771. raw_output: Optional[str] = None
  772. ) -> dict:
  773. """Create a standardized error response."""
  774. response = {
  775. "mandatory": {attr: "Not Specified" for attr in mandatory_attrs.keys()},
  776. "error": error
  777. }
  778. if extract_additional:
  779. response["additional"] = {}
  780. if raw_output:
  781. response["raw_output"] = raw_output
  782. return response