# services.py
# ---- Previous working version, kept commented out while support for multiple values with semantic matching is added ----
  2. # import json
  3. # import hashlib
  4. # import logging
  5. # import time
  6. # from functools import wraps
  7. # from typing import Dict, List, Optional, Tuple
  8. # import requests
  9. # from django.conf import settings
  10. # from .llm_load_balancer import call_llm_with_load_balancer
  11. # from .cache_config import (
  12. # is_caching_enabled,
  13. # ENABLE_ATTRIBUTE_EXTRACTION_CACHE,
  14. # ATTRIBUTE_CACHE_MAX_SIZE,
  15. # )
  16. # logger = logging.getLogger(__name__)
  17. # # --------------------------------------------------------------------------- #
  18. # # CACHES
  19. # # --------------------------------------------------------------------------- #
  20. # class SimpleCache:
  21. # _cache = {}
  22. # _max_size = ATTRIBUTE_CACHE_MAX_SIZE
  23. # @classmethod
  24. # def get(cls, key: str) -> Optional[Dict]:
  25. # if not ENABLE_ATTRIBUTE_EXTRACTION_CACHE: return None
  26. # return cls._cache.get(key)
  27. # @classmethod
  28. # def set(cls, key: str, value: Dict):
  29. # if not ENABLE_ATTRIBUTE_EXTRACTION_CACHE: return
  30. # if len(cls._cache) >= cls._max_size:
  31. # items = list(cls._cache.items())
  32. # cls._cache = dict(items[int(cls._max_size * 0.2):])
  33. # cls._cache[key] = value
  34. # @classmethod
  35. # def clear(cls): cls._cache.clear()
  36. # @classmethod
  37. # def get_stats(cls) -> Dict:
  38. # return {
  39. # "enabled": ENABLE_ATTRIBUTE_EXTRACTION_CACHE,
  40. # "size": len(cls._cache),
  41. # "max_size": cls._max_size,
  42. # "usage_percent": round(len(cls._cache)/cls._max_size*100, 2) if cls._max_size else 0
  43. # }
  44. # # --------------------------------------------------------------------------- #
  45. # # RETRY DECORATOR
  46. # # --------------------------------------------------------------------------- #
  47. # def retry(max_attempts=3, delay=1.0):
  48. # def decorator(f):
  49. # @wraps(f)
  50. # def wrapper(*args, **kwargs):
  51. # last_exc = None
  52. # for i in range(max_attempts):
  53. # try:
  54. # return f(*args, **kwargs)
  55. # except Exception as e:
  56. # last_exc = e
  57. # if i < max_attempts - 1:
  58. # wait = delay * (2 ** i)
  59. # logger.warning(f"Retry {i+1}/{max_attempts} after {wait}s: {e}")
  60. # time.sleep(wait)
  61. # raise last_exc or RuntimeError("Retry failed")
  62. # return wrapper
  63. # return decorator
  64. # # --------------------------------------------------------------------------- #
  65. # # MAIN SERVICE
  66. # # --------------------------------------------------------------------------- #
  67. # class ProductAttributeService:
  68. # @staticmethod
  69. # def combine_product_text(title=None, short_desc=None, long_desc=None, ocr_text=None) -> Tuple[str, Dict[str, str]]:
  70. # parts = []
  71. # source_map = {}
  72. # if title:
  73. # t = str(title).strip()
  74. # parts.append(f"Title: {t}")
  75. # source_map["title"] = t
  76. # if short_desc:
  77. # s = str(short_desc).strip()
  78. # parts.append(f"Description: {s}")
  79. # source_map["short_desc"] = s
  80. # if long_desc:
  81. # l = str(long_desc).strip()
  82. # parts.append(f"Details: {l}")
  83. # source_map["long_desc"] = l
  84. # if ocr_text:
  85. # parts.append(f"OCR Text: {ocr_text}")
  86. # source_map["ocr_text"] = ocr_text
  87. # combined = "\n".join(parts).strip()
  88. # return (combined or "No product information", source_map)
  89. # @staticmethod
  90. # def _cache_key(product_text: str, mandatory_attrs: Dict, extract_additional: bool, multiple: List[str], user_values: Dict = None) -> str:
  91. # payload = {
  92. # "text": product_text,
  93. # "attrs": mandatory_attrs,
  94. # "extra": extract_additional,
  95. # "multiple": sorted(multiple),
  96. # "user_values": user_values or {}
  97. # }
  98. # return f"attr_{hashlib.md5(json.dumps(payload, sort_keys=True).encode()).hexdigest()}"
  99. # @staticmethod
  100. # def _clean_json(text: str) -> str:
  101. # start = text.find("{")
  102. # end = text.rfind("}") + 1
  103. # if start != -1 and end > start:
  104. # text = text[start:end]
  105. # if "```json" in text:
  106. # text = text.split("```json", 1)[1].split("```", 1)[0]
  107. # elif "```" in text:
  108. # text = text.split("```", 1)[1].split("```", 1)[0]
  109. # if text.lstrip().startswith("json"): text = text[4:]
  110. # return text.strip()
  111. # @staticmethod
  112. # def format_visual_attributes(visual_attributes: Dict) -> Dict:
  113. # formatted = {}
  114. # for key, value in visual_attributes.items():
  115. # if isinstance(value, list):
  116. # formatted[key] = [{"value": str(item), "source": "image"} for item in value]
  117. # elif isinstance(value, dict):
  118. # nested = {}
  119. # for sub_key, sub_val in value.items():
  120. # if isinstance(sub_val, list):
  121. # nested[sub_key] = [{"value": str(v), "source": "image"} for v in sub_val]
  122. # else:
  123. # nested[sub_key] = [{"value": str(sub_val), "source": "image"}]
  124. # formatted[key] = nested
  125. # else:
  126. # formatted[key] = [{"value": str(value), "source": "image"}]
  127. # return formatted
  128. # # @staticmethod
  129. # # @retry(max_attempts=3, delay=1.0)
  130. # # def _call_llm(payload: dict) -> str:
  131. # # headers = {"Authorization": f"Bearer {settings.GROQ_API_KEY}", "Content-Type": "application/json"}
  132. # # resp = requests.post(settings.GROQ_API_URL, headers=headers, json=payload, timeout=30)
  133. # # resp.raise_for_status()
  134. # # return resp.json()["choices"][0]["message"]["content"]
  135. # # At the top of services.py, add this import
  136. # # from . import call_llm_with_load_balancer, get_load_balancer_stats
  137. # # Replace the existing _call_llm method with this:
  138. # @staticmethod
  139. # @retry(max_attempts=3, delay=3.0)
  140. # def _call_llm(payload: dict) -> str:
  141. # """
  142. # Call LLM using load balancer with multiple API keys
  143. # Automatically handles rate limiting and failover
  144. # """
  145. # return call_llm_with_load_balancer(payload)
  146. # @staticmethod
  147. # def extract_attributes(
  148. # product_text: str,
  149. # mandatory_attrs: Dict[str, List[str]],
  150. # source_map: Dict[str, str] = None,
  151. # model: str = None,
  152. # extract_additional: bool = True,
  153. # multiple: Optional[List[str]] = None,
  154. # use_cache: Optional[bool] = None,
  155. # user_entered_values: Optional[Dict[str, str]] = None, # NEW PARAMETER
  156. # ) -> dict:
  157. # if model is None: model = settings.SUPPORTED_MODELS[0]
  158. # if multiple is None: multiple = []
  159. # if source_map is None: source_map = {}
  160. # if user_entered_values is None: user_entered_values = {}
  161. # if use_cache is None: use_cache = ENABLE_ATTRIBUTE_EXTRACTION_CACHE
  162. # if not is_caching_enabled(): use_cache = False
  163. # cache_key = None
  164. # if use_cache:
  165. # cache_key = ProductAttributeService._cache_key(
  166. # product_text, mandatory_attrs, extract_additional, multiple, user_entered_values
  167. # )
  168. # cached = SimpleCache.get(cache_key)
  169. # if cached:
  170. # logger.info(f"CACHE HIT {cache_key[:16]}...")
  171. # return cached
  172. # # --------------------------- BUILD USER VALUES SECTION ---------------------------
  173. # user_values_section = ""
  174. # if user_entered_values:
  175. # user_lines = []
  176. # for attr, value in user_entered_values.items():
  177. # user_lines.append(f" - {attr}: {value}")
  178. # user_values_section = f"""
  179. # USER MANUALLY ENTERED VALUES:
  180. # {chr(10).join(user_lines)}
  181. # IMPORTANT INSTRUCTIONS FOR USER VALUES:
  182. # 1. Choose the BEST value (could be user's value, or from allowed list, or inferred)
  183. # 2. Always provide a "reason" field explaining your decision. Your reason should be valid and from the product text. Not always exact word to be matched from the product text, you can infer understanding the product text.
  184. # 3. DO NOT hallucinate - be honest if user's value seems wrong based on product evidence
  185. # 4. If user's value is not in the allowed list but seems correct, chose the most nearest value from the allowed list with proper reasoning why it chose it. Also in this case give the most apt value that should be added in the possible list in the reason so that user can edit it later.
  186. # """
  187. # # --------------------------- PROMPT ---------------------------
  188. # allowed_lines = [f"{attr}: {', '.join(vals)}" for attr, vals in mandatory_attrs.items()]
  189. # allowed_text = "\n".join(allowed_lines)
  190. # allowed_sources = list(source_map.keys()) + ["title", "description", "inferred"]
  191. # source_hint = "|".join(allowed_sources)
  192. # multiple_text = f"\nMULTIPLE ALLOWED FOR: {', '.join(multiple)}" if multiple else ""
  193. # if extract_additional:
  194. # additional_instructions = """
  195. # For the 'additional' section, identify any other important product attributes and their values (e.g., 'Color', 'Material', 'Weight' etc according to the product text) that are present in the PRODUCT TEXT but not in the Mandatory Attribute list.
  196. # For each additional attribute, use the best available value from the PRODUCT TEXT and specify the 'source'.
  197. # Strictly Extract other key attributes other than mandatory attributes from the text.
  198. # """
  199. # output_example_additional = """
  200. # "additional": {
  201. # "Additional_Attr_1": [{
  202. # "value": "Value 1",
  203. # "source": "<{source_hint}>",
  204. # "reason": "Why this attribute and value were identified"
  205. # }]
  206. # }
  207. # """
  208. # else:
  209. # additional_instructions = """
  210. # Do not identify or include any additional attributes. The 'additional' section must be an empty object {}.
  211. # """
  212. # output_example_additional = ' "additional": {}'
  213. # prompt = f"""
  214. # You are a product-attribute classifier and validator.
  215. # Understand the product text very deeply. If the same product is available somewhere online, use that knowledge to predict accurate attribute values.
  216. # Do not depend only on word-by-word matching from the product text - interpret the meaning and suggest attributes intelligently.
  217. # Pick the *closest meaning* value from the allowed list, even if not an exact word match.
  218. # I want values for all mandatory attributes.
  219. # If a value is not found anywhere, the source should be "inferred".
  220. # Note: Source means from where you have concluded the result. Choose one of these value <{source_hint}>
  221. # ALLOWED VALUES (MANDATORY):
  222. # {allowed_text}
  223. # Note: "Strictly" return multiple values for these attributes: {multiple_text}. These values must be most possible values from the list and should be max 2 values.
  224. # {user_values_section}
  225. # {additional_instructions}
  226. # PRODUCT TEXT:
  227. # {product_text}
  228. # OUTPUT (strict JSON only):
  229. # {{
  230. # "mandatory": {{
  231. # "<attr>": [{{
  232. # "value": "<chosen_value>",
  233. # "source": "<{source_hint}>",
  234. # "reason": "Explanation of why this value was chosen. If user provided a value, explain why you agreed/disagreed with it.",
  235. # "original_value": "<user_entered_value_if_provided>",
  236. # "decision": "accepted|rejected|not_provided"
  237. # }}]
  238. # }},
  239. # {output_example_additional}
  240. # }}
  241. # RULES:
  242. # - For each mandatory attribute with a user-entered value, include "original_value" and "decision" fields
  243. # - "decision" values: "accepted" (used user's value), "rejected" (used different value), "not_provided" (no user value given)
  244. # - "reason" must explain your choice, especially when rejecting user input
  245. # - For 'multiple' attributes, always give multiple values for those attributes, choose wisely and max 2 values per attribute that are very close.
  246. # - Source must be one of: {source_hint}
  247. # - Be honest and specific in your reasoning.
  248. # - Return ONLY valid JSON
  249. # """
  250. # payload = {
  251. # "model": model,
  252. # "messages": [
  253. # {"role": "system", "content": "You are a JSON-only extractor and validator. Always provide clear reasoning for your decisions."},
  254. # {"role": "user", "content": prompt},
  255. # ],
  256. # "temperature": 0.3,
  257. # "max_tokens": 2000, # Increased for reasoning
  258. # }
  259. # try:
  260. # raw = ProductAttributeService._call_llm(payload)
  261. # logger.info("Raw LLM response received")
  262. # cleaned = ProductAttributeService._clean_json(raw)
  263. # parsed = json.loads(cleaned)
  264. # except Exception as exc:
  265. # logger.error(f"LLM failed: {exc}")
  266. # return {
  267. # "mandatory": {
  268. # a: [{
  269. # "value": "Not Specified",
  270. # "source": "llm_error",
  271. # "reason": f"LLM processing failed: {str(exc)}"
  272. # }] for a in mandatory_attrs
  273. # },
  274. # "additional": {} if not extract_additional else {},
  275. # "error": str(exc)
  276. # }
  277. # if use_cache and cache_key:
  278. # SimpleCache.set(cache_key, parsed)
  279. # logger.info(f"CACHE SET {cache_key[:16]}...")
  280. # return parsed
  281. # @staticmethod
  282. # def get_cache_stats() -> Dict:
  283. # return {
  284. # "global_enabled": is_caching_enabled(),
  285. # "result_cache": SimpleCache.get_stats(),
  286. # }
  287. # @staticmethod
  288. # def clear_all_caches():
  289. # SimpleCache.clear()
  290. # logger.info("All caches cleared")
# ---- Active version: same code as the commented-out version above, with only the prompt edited ----
  292. import json
  293. import hashlib
  294. import logging
  295. import time
  296. from functools import wraps
  297. from typing import Dict, List, Optional, Tuple
  298. import requests
  299. from django.conf import settings
  300. from .llm_load_balancer import call_llm_with_load_balancer
  301. from .cache_config import (
  302. is_caching_enabled,
  303. ENABLE_ATTRIBUTE_EXTRACTION_CACHE,
  304. ATTRIBUTE_CACHE_MAX_SIZE,
  305. )
  306. logger = logging.getLogger(__name__)
  307. # --------------------------------------------------------------------------- #
  308. # CACHES
  309. # --------------------------------------------------------------------------- #
  310. class SimpleCache:
  311. _cache = {}
  312. _max_size = ATTRIBUTE_CACHE_MAX_SIZE
  313. @classmethod
  314. def get(cls, key: str) -> Optional[Dict]:
  315. if not ENABLE_ATTRIBUTE_EXTRACTION_CACHE: return None
  316. return cls._cache.get(key)
  317. @classmethod
  318. def set(cls, key: str, value: Dict):
  319. if not ENABLE_ATTRIBUTE_EXTRACTION_CACHE: return
  320. if len(cls._cache) >= cls._max_size:
  321. items = list(cls._cache.items())
  322. cls._cache = dict(items[int(cls._max_size * 0.2):])
  323. cls._cache[key] = value
  324. @classmethod
  325. def clear(cls): cls._cache.clear()
  326. @classmethod
  327. def get_stats(cls) -> Dict:
  328. return {
  329. "enabled": ENABLE_ATTRIBUTE_EXTRACTION_CACHE,
  330. "size": len(cls._cache),
  331. "max_size": cls._max_size,
  332. "usage_percent": round(len(cls._cache)/cls._max_size*100, 2) if cls._max_size else 0
  333. }
  334. # --------------------------------------------------------------------------- #
  335. # RETRY DECORATOR
  336. # --------------------------------------------------------------------------- #
  337. def retry(max_attempts=3, delay=1.0):
  338. def decorator(f):
  339. @wraps(f)
  340. def wrapper(*args, **kwargs):
  341. last_exc = None
  342. for i in range(max_attempts):
  343. try:
  344. return f(*args, **kwargs)
  345. except Exception as e:
  346. last_exc = e
  347. if i < max_attempts - 1:
  348. wait = delay * (2 ** i)
  349. logger.warning(f"Retry {i+1}/{max_attempts} after {wait}s: {e}")
  350. time.sleep(wait)
  351. raise last_exc or RuntimeError("Retry failed")
  352. return wrapper
  353. return decorator
  354. # --------------------------------------------------------------------------- #
  355. # MAIN SERVICE
  356. # --------------------------------------------------------------------------- #
  357. class ProductAttributeService:
  358. @staticmethod
  359. def combine_product_text(title=None, short_desc=None, long_desc=None, ocr_text=None) -> Tuple[str, Dict[str, str]]:
  360. parts = []
  361. source_map = {}
  362. if title:
  363. t = str(title).strip()
  364. parts.append(f"Title: {t}")
  365. source_map["title"] = t
  366. if short_desc:
  367. s = str(short_desc).strip()
  368. parts.append(f"Description: {s}")
  369. source_map["short_desc"] = s
  370. if long_desc:
  371. l = str(long_desc).strip()
  372. parts.append(f"Details: {l}")
  373. source_map["long_desc"] = l
  374. if ocr_text:
  375. parts.append(f"OCR Text: {ocr_text}")
  376. source_map["ocr_text"] = ocr_text
  377. combined = "\n".join(parts).strip()
  378. return (combined or "No product information", source_map)
  379. @staticmethod
  380. def _cache_key(product_text: str, mandatory_attrs: Dict, extract_additional: bool, multiple: List[str], user_values: Dict = None) -> str:
  381. payload = {
  382. "text": product_text,
  383. "attrs": mandatory_attrs,
  384. "extra": extract_additional,
  385. "multiple": sorted(multiple),
  386. "user_values": user_values or {}
  387. }
  388. return f"attr_{hashlib.md5(json.dumps(payload, sort_keys=True).encode()).hexdigest()}"
  389. # @staticmethod
  390. # def _clean_json(text: str) -> str:
  391. # start = text.find("{")
  392. # end = text.rfind("}") + 1
  393. # if start != -1 and end > start:
  394. # text = text[start:end]
  395. # if "```json" in text:
  396. # text = text.split("```json", 1)[1].split("```", 1)[0]
  397. # elif "```" in text:
  398. # text = text.split("```", 1)[1].split("```", 1)[0]
  399. # if text.lstrip().startswith("json"): text = text[4:]
  400. # return text.strip()
  401. @staticmethod
  402. def _clean_json(text: str) -> str:
  403. text = text.strip()
  404. # Extract JSON block if wrapped in ```json or ```
  405. if "```json" in text:
  406. text = text.split("```json", 1)[1].split("```", 1)[0]
  407. elif "```" in text:
  408. parts = text.split("```", 2)
  409. if len(parts) > 1:
  410. text = parts[1]
  411. # Find first { and last }
  412. start = text.find("{")
  413. end = text.rfind("}") + 1
  414. if start == -1 or end <= start:
  415. raise ValueError("No JSON object found in LLM response")
  416. text = text[start:end]
  417. return text.strip()
  418. @staticmethod
  419. def format_visual_attributes(visual_attributes: Dict) -> Dict:
  420. formatted = {}
  421. for key, value in visual_attributes.items():
  422. if isinstance(value, list):
  423. formatted[key] = [{"value": str(item), "source": "image"} for item in value]
  424. elif isinstance(value, dict):
  425. nested = {}
  426. for sub_key, sub_val in value.items():
  427. if isinstance(sub_val, list):
  428. nested[sub_key] = [{"value": str(v), "source": "image"} for v in sub_val]
  429. else:
  430. nested[sub_key] = [{"value": str(sub_val), "source": "image"}]
  431. formatted[key] = nested
  432. else:
  433. formatted[key] = [{"value": str(value), "source": "image"}]
  434. return formatted
  435. # @staticmethod
  436. # @retry(max_attempts=3, delay=1.0)
  437. # def _call_llm(payload: dict) -> str:
  438. # headers = {"Authorization": f"Bearer {settings.GROQ_API_KEY}", "Content-Type": "application/json"}
  439. # resp = requests.post(settings.GROQ_API_URL, headers=headers, json=payload, timeout=30)
  440. # resp.raise_for_status()
  441. # return resp.json()["choices"][0]["message"]["content"]
  442. # At the top of services.py, add this import
  443. # from . import call_llm_with_load_balancer, get_load_balancer_stats
  444. # Replace the existing _call_llm method with this:
  445. @staticmethod
  446. @retry(max_attempts=3, delay=3.0)
  447. def _call_llm(payload: dict) -> str:
  448. """
  449. Call LLM using load balancer with multiple API keys
  450. Automatically handles rate limiting and failover
  451. """
  452. return call_llm_with_load_balancer(payload)
  453. @staticmethod
  454. def extract_attributes(
  455. product_text: str,
  456. mandatory_attrs: Dict[str, List[str]],
  457. source_map: Dict[str, str] = None,
  458. model: str = None,
  459. extract_additional: bool = True,
  460. multiple: Optional[List[str]] = None,
  461. use_cache: Optional[bool] = None,
  462. user_entered_values: Optional[Dict[str, str]] = None, # NEW PARAMETER
  463. ) -> dict:
  464. if model is None: model = settings.SUPPORTED_MODELS[0]
  465. if multiple is None: multiple = []
  466. if source_map is None: source_map = {}
  467. if user_entered_values is None: user_entered_values = {}
  468. if use_cache is None: use_cache = ENABLE_ATTRIBUTE_EXTRACTION_CACHE
  469. if not is_caching_enabled(): use_cache = False
  470. cache_key = None
  471. if use_cache:
  472. cache_key = ProductAttributeService._cache_key(
  473. product_text, mandatory_attrs, extract_additional, multiple, user_entered_values
  474. )
  475. cached = SimpleCache.get(cache_key)
  476. if cached:
  477. logger.info(f"CACHE HIT {cache_key[:16]}...")
  478. return cached
  479. # --------------------------- BUILD USER VALUES SECTION ---------------------------
  480. user_values_section = ""
  481. if user_entered_values:
  482. user_lines = []
  483. for attr, value in user_entered_values.items():
  484. user_lines.append(f" - {attr}: {value}")
  485. user_values_section = f"""
  486. USER MANUALLY ENTERED VALUES:
  487. {chr(10).join(user_lines)}
  488. IMPORTANT INSTRUCTIONS FOR USER VALUES:
  489. 1. Choose the BEST value (could be user's value, or from allowed list, or inferred)
  490. 2. Always provide a "reason" field explaining your decision. Your reason should be valid and from the product text. Not always exact word to be matched from the product text, you can infer understanding the product text.
  491. 3. DO NOT hallucinate - be honest if user's value seems wrong based on product evidence
  492. 4. If user's value is not in the allowed list but seems correct, chose the most nearest value from the allowed list with proper reasoning why it chose it. Also in this case give the most apt value that should be added in the possible list in the reason so that user can edit it later.
  493. """
  494. # --------------------------- PROMPT ---------------------------
  495. allowed_lines = [f"{attr}: {', '.join(vals)}" for attr, vals in mandatory_attrs.items()]
  496. allowed_text = "\n".join(allowed_lines)
  497. print(f"allowed text is {allowed_text}")
  498. allowed_sources = list(source_map.keys()) + ["title", "description", "inferred"]
  499. print(f"allowed_sources is {allowed_sources}")
  500. source_hint = "|".join(allowed_sources)
  501. multiple_text = f"\nMULTIPLE ALLOWED FOR: {', '.join(multiple)}" if multiple else ""
  502. if extract_additional:
  503. additional_instructions = """
  504. For the 'additional' section, identify any other important product attributes and their values (e.g., 'Color', 'Material', 'Weight' etc according to the product text) that are present in the PRODUCT TEXT but not in the Mandatory Attribute list.
  505. For each additional attribute, use the best available value from the PRODUCT TEXT and specify the 'source'.
  506. Strictly Extract other key attributes other than mandatory attributes from the text.
  507. """
  508. output_example_additional = """
  509. "additional": {
  510. "Additional_Attr_1": [{
  511. "value": "Value 1",
  512. "source": "<{source_hint}>",
  513. "reason": "Why this attribute and value were identified"
  514. }]
  515. }
  516. """
  517. else:
  518. additional_instructions = """
  519. Do not identify or include any additional attributes. The 'additional' section must be an empty object {}.
  520. """
  521. output_example_additional = ' "additional": {}'
  522. prompt = f"""
  523. You are a product-attribute classifier and validator.
  524. Understand the product text very deeply. If the same product is available somewhere online, use that knowledge to predict accurate attribute values.
  525. Do not depend only on word-by-word matching from the product text - interpret the meaning and suggest attributes intelligently.
  526. Pick the *closest meaning* value from the allowed list, even if not an exact word match.
  527. I want values for all mandatory attributes.
  528. If a value is not found anywhere, the source should be "inferred".
  529. Note: Source means from where you have concluded the result. Choose one of these value <{source_hint}>
  530. Do not give "Condition" attribute ever.
  531. ALLOWED VALUES (MANDATORY):
  532. {allowed_text}
  533. Note: "Strictly" return multiple values for these attributes: {multiple_text}. These values must be most possible values from the list and should be max 2 values.
  534. {user_values_section}
  535. {additional_instructions}
  536. PRODUCT TEXT:
  537. {product_text}
  538. OUTPUT (strict JSON only):
  539. {{
  540. "mandatory": {{
  541. "<attr>": [{{
  542. "value": "<chosen_value>",
  543. "source": "<{source_hint}>",
  544. "reason": "Explanation of why this value was chosen. If user provided a value, explain why you agreed/disagreed with it.",
  545. "original_value": "<user_entered_value_if_provided>",
  546. "decision": "accepted|rejected|not_provided"
  547. }}]
  548. }},
  549. {output_example_additional}
  550. }}
  551. RULES:
  552. - For each mandatory attribute with a user-entered value, include "original_value" and "decision" fields
  553. - "decision" values: "accepted" (used user's value), "rejected" (used different value), "not_provided" (no user value given)
  554. - "reason" must explain your choice, especially when rejecting user input
  555. - For 'multiple' attributes, always give multiple values for those attributes, choose wisely and max 2 values per attribute that are very close.
  556. - Source must be one of: {source_hint}
  557. - Be honest and specific in your reasoning.
  558. - Return ONLY valid JSON
  559. """
  560. payload = {
  561. "model": model,
  562. "messages": [
  563. {"role": "system", "content": "You are a JSON-only extractor and validator. Always provide clear reasoning for your decisions."},
  564. {"role": "user", "content": prompt},
  565. ],
  566. "temperature": 0.3,
  567. "max_tokens": 2000, # Increased for reasoning
  568. }
  569. try:
  570. raw = ProductAttributeService._call_llm(payload)
  571. logger.info("Raw LLM response received")
  572. cleaned = ProductAttributeService._clean_json(raw)
  573. parsed = json.loads(cleaned)
  574. except Exception as exc:
  575. logger.error(f"LLM failed: {exc}")
  576. return {
  577. "mandatory": {
  578. a: [{
  579. "value": "Not Specified",
  580. "source": "llm_error",
  581. "reason": f"LLM processing failed: {str(exc)}"
  582. }] for a in mandatory_attrs
  583. },
  584. "additional": {} if not extract_additional else {},
  585. "error": str(exc)
  586. }
  587. if use_cache and cache_key:
  588. SimpleCache.set(cache_key, parsed)
  589. logger.info(f"CACHE SET {cache_key[:16]}...")
  590. return parsed
  591. @staticmethod
  592. def get_cache_stats() -> Dict:
  593. return {
  594. "global_enabled": is_caching_enabled(),
  595. "result_cache": SimpleCache.get_stats(),
  596. }
  597. @staticmethod
  598. def clear_all_caches():
  599. SimpleCache.clear()
  600. logger.info("All caches cleared")
  601. # IMPORTANT INSTRUCTIONS FOR USER VALUES:
  602. # 1. Compare the user-entered value with what you find in the product text
  603. # 2. Evaluate if the user value is correct, partially correct, or incorrect for this product
  604. # 3. Choose the BEST value (could be user's value, or from allowed list, or inferred)
  605. # 4. Always provide a "reason" field explaining your decision
  606. # 5. DO NOT hallucinate - be honest if user's value seems wrong based on product evidence
# 6. If the user's value is not in the allowed list but seems correct, choose the nearest value from the allowed list and give proper reasoning.