# services.py
# NOTE(review): the lines above this comment in the original paste were viewer
# artifacts (file size and a blob of concatenated line numbers), removed here.
  1. # # ==================== DEAD CODE — revision v1 (FULL WHITELIST + SEMANTIC RECOVERY), superseded by the live version below; safe to delete, history lives in version control ====================
  2. # import json
  3. # import hashlib
  4. # import logging
  5. # import warnings
  6. # import time
  7. # from functools import wraps
  8. # from typing import Dict, List, Optional, Tuple
  9. # import os
  10. # import requests
  11. # from django.conf import settings
  12. # from sentence_transformers import SentenceTransformer, util
  13. # # --------------------------------------------------------------------------- #
  14. # # CACHE CONFIG
  15. # # --------------------------------------------------------------------------- #
  16. # from .cache_config import (
  17. # is_caching_enabled,
  18. # ENABLE_ATTRIBUTE_EXTRACTION_CACHE,
  19. # ENABLE_EMBEDDING_CACHE,
  20. # ATTRIBUTE_CACHE_MAX_SIZE,
  21. # EMBEDDING_CACHE_MAX_SIZE,
  22. # )
  23. # logger = logging.getLogger(__name__)
  24. # # --------------------------------------------------------------------------- #
  25. # # ONE-TIME MODEL LOAD
  26. # # --------------------------------------------------------------------------- #
  27. # print("Loading sentence transformer model (semantic recovery)...")
  28. # model_embedder = SentenceTransformer("all-MiniLM-L6-v2")
  29. # os.environ["TOKENIZERS_PARALLELISM"] = "false"
  30. # print("Model loaded")
  31. # # --------------------------------------------------------------------------- #
  32. # # CACHES
  33. # # --------------------------------------------------------------------------- #
  34. # class SimpleCache:
  35. # _cache = {}
  36. # _max_size = ATTRIBUTE_CACHE_MAX_SIZE
  37. # @classmethod
  38. # def get(cls, key: str) -> Optional[Dict]:
  39. # if not ENABLE_ATTRIBUTE_EXTRACTION_CACHE: return None
  40. # return cls._cache.get(key)
  41. # @classmethod
  42. # def set(cls, key: str, value: Dict):
  43. # if not ENABLE_ATTRIBUTE_EXTRACTION_CACHE: return
  44. # if len(cls._cache) >= cls._max_size:
  45. # items = list(cls._cache.items())
  46. # cls._cache = dict(items[int(cls._max_size * 0.2):])
  47. # cls._cache[key] = value
  48. # @classmethod
  49. # def clear(cls): cls._cache.clear()
  50. # @classmethod
  51. # def get_stats(cls) -> Dict:
  52. # return {
  53. # "enabled": ENABLE_ATTRIBUTE_EXTRACTION_CACHE,
  54. # "size": len(cls._cache),
  55. # "max_size": cls._max_size,
  56. # "usage_percent": round(len(cls._cache)/cls._max_size*100, 2) if cls._max_size else 0
  57. # }
  58. # class EmbeddingCache:
  59. # _cache = {}
  60. # _max_size = EMBEDDING_CACHE_MAX_SIZE
  61. # _hit = _miss = 0
  62. # @classmethod
  63. # def get_embedding(cls, text: str, model):
  64. # if not ENABLE_EMBEDDING_CACHE:
  65. # with warnings.catch_warnings():
  66. # warnings.simplefilter("ignore")
  67. # return model.encode(text, convert_to_tensor=True, show_progress_bar=False)
  68. # if text in cls._cache:
  69. # cls._hit += 1
  70. # return cls._cache[text]
  71. # cls._miss += 1
  72. # if len(cls._cache) >= cls._max_size:
  73. # items = list(cls._cache.items())
  74. # cls._cache = dict(items[int(cls._max_size * 0.3):])
  75. # with warnings.catch_warnings():
  76. # warnings.simplefilter("ignore")
  77. # emb = model.encode(text, convert_to_tensor=True, show_progress_bar=False)
  78. # cls._cache[text] = emb
  79. # return emb
  80. # @classmethod
  81. # def clear(cls):
  82. # cls._cache.clear()
  83. # cls._hit = cls._miss = 0
  84. # @classmethod
  85. # def get_stats(cls) -> Dict:
  86. # total = cls._hit + cls._miss
  87. # rate = (cls._hit / total * 100) if total else 0
  88. # return {
  89. # "enabled": ENABLE_EMBEDDING_CACHE,
  90. # "size": len(cls._cache),
  91. # "hits": cls._hit,
  92. # "misses": cls._miss,
  93. # "hit_rate_percent": round(rate, 2),
  94. # }
  95. # # --------------------------------------------------------------------------- #
  96. # # RETRY DECORATOR
  97. # # --------------------------------------------------------------------------- #
  98. # def retry(max_attempts=3, delay=1.0):
  99. # def decorator(f):
  100. # @wraps(f)
  101. # def wrapper(*args, **kwargs):
  102. # last_exc = None
  103. # for i in range(max_attempts):
  104. # try:
  105. # return f(*args, **kwargs)
  106. # except Exception as e:
  107. # last_exc = e
  108. # if i < max_attempts - 1:
  109. # wait = delay * (2 ** i)
  110. # logger.warning(f"Retry {i+1}/{max_attempts} after {wait}s: {e}")
  111. # time.sleep(wait)
  112. # raise last_exc or RuntimeError("Retry failed")
  113. # return wrapper
  114. # return decorator
  115. # # --------------------------------------------------------------------------- #
  116. # # MAIN SERVICE
  117. # # --------------------------------------------------------------------------- #
  118. # class ProductAttributeService:
  119. # @staticmethod
  120. # def combine_product_text(title=None, short_desc=None, long_desc=None, ocr_text=None) -> Tuple[str, Dict[str, str]]:
  121. # parts = []
  122. # source_map = {}
  123. # if title:
  124. # t = str(title).strip()
  125. # parts.append(f"Title: {t}")
  126. # source_map["title"] = t
  127. # if short_desc:
  128. # s = str(short_desc).strip()
  129. # parts.append(f"Description: {s}")
  130. # source_map["short_desc"] = s
  131. # if long_desc:
  132. # l = str(long_desc).strip()
  133. # parts.append(f"Details: {l}")
  134. # source_map["long_desc"] = l
  135. # if ocr_text:
  136. # parts.append(f"OCR Text: {ocr_text}")
  137. # source_map["ocr_text"] = ocr_text
  138. # combined = "\n".join(parts).strip()
  139. # return (combined or "No product information", source_map)
  140. # @staticmethod
  141. # def _cache_key(product_text: str, mandatory_attrs: Dict, extract_additional: bool, multiple: List[str]) -> str:
  142. # payload = {"text": product_text, "attrs": mandatory_attrs, "extra": extract_additional, "multiple": sorted(multiple)}
  143. # return f"attr_{hashlib.md5(json.dumps(payload, sort_keys=True).encode()).hexdigest()}"
  144. # @staticmethod
  145. # def _clean_json(text: str) -> str:
  146. # start = text.find("{")
  147. # end = text.rfind("}") + 1
  148. # if start != -1 and end > start:
  149. # text = text[start:end]
  150. # if "```json" in text:
  151. # text = text.split("```json", 1)[1].split("```", 1)[0]
  152. # elif "```" in text:
  153. # text = text.split("```", 1)[1].split("```", 1)[0]
  154. # if text.lstrip().startswith("json"): text = text[4:]
  155. # return text.strip()
  156. # @staticmethod
  157. # def _lexical_evidence(product_text: str, label: str) -> float:
  158. # pt = product_text.lower()
  159. # tokens = [t for t in label.lower().replace("-", " ").split() if t]
  160. # if not tokens: return 0.0
  161. # hits = sum(1 for t in tokens if t in pt)
  162. # return hits / len(tokens)
  163. # @staticmethod
  164. # def _find_source(value: str, source_map: Dict[str, str]) -> str:
  165. # value_lower = value.lower()
  166. # for src_key, text in source_map.items():
  167. # if value_lower in text.lower():
  168. # return src_key
  169. # return "not_found"
  170. # @staticmethod
  171. # def format_visual_attributes(visual_attributes: Dict) -> Dict:
  172. # formatted = {}
  173. # for key, value in visual_attributes.items():
  174. # if isinstance(value, list):
  175. # formatted[key] = [{"value": str(item), "source": "image"} for item in value]
  176. # elif isinstance(value, dict):
  177. # nested = {}
  178. # for sub_key, sub_val in value.items():
  179. # if isinstance(sub_val, list):
  180. # nested[sub_key] = [{"value": str(v), "source": "image"} for v in sub_val]
  181. # else:
  182. # nested[sub_key] = [{"value": str(sub_val), "source": "image"}]
  183. # formatted[key] = nested
  184. # else:
  185. # formatted[key] = [{"value": str(value), "source": "image"}]
  186. # return formatted
  187. # @staticmethod
  188. # @retry(max_attempts=3, delay=1.0)
  189. # def _call_llm(payload: dict) -> str:
  190. # headers = {"Authorization": f"Bearer {settings.GROQ_API_KEY}", "Content-Type": "application/json"}
  191. # resp = requests.post(settings.GROQ_API_URL, headers=headers, json=payload, timeout=30)
  192. # resp.raise_for_status()
  193. # return resp.json()["choices"][0]["message"]["content"]
  194. # @staticmethod
  195. # def extract_attributes(
  196. # product_text: str,
  197. # mandatory_attrs: Dict[str, List[str]],
  198. # source_map: Dict[str, str] = None,
  199. # model: str = None,
  200. # extract_additional: bool = True,
  201. # multiple: Optional[List[str]] = None,
  202. # use_cache: Optional[bool] = None,
  203. # ) -> dict:
  204. # if model is None: model = settings.SUPPORTED_MODELS[0]
  205. # if multiple is None: multiple = []
  206. # if source_map is None: source_map = {}
  207. # if use_cache is None: use_cache = ENABLE_ATTRIBUTE_EXTRACTION_CACHE
  208. # if not is_caching_enabled(): use_cache = False
  209. # cache_key = None
  210. # if use_cache:
  211. # cache_key = ProductAttributeService._cache_key(product_text, mandatory_attrs, extract_additional, multiple)
  212. # cached = SimpleCache.get(cache_key)
  213. # if cached:
  214. # logger.info(f"CACHE HIT {cache_key[:16]}...")
  215. # return cached
  216. # # --------------------------- PROMPT WITH FULL WHITELIST ---------------------------
  217. # allowed_lines = [f"{attr}: {', '.join(vals)}" for attr, vals in mandatory_attrs.items()]
  218. # allowed_text = "\n".join(allowed_lines)
  219. # allowed_sources = list(source_map.keys()) + ["not_found"]
  220. # source_hint = "|".join(allowed_sources)
  221. # multiple_text = f"\nMULTIPLE ALLOWED FOR: {', '.join(multiple)}" if multiple else ""
  222. # # --- Suggested Change to the Prompt ---
  223. # additional_instructions = """
  224. # For the 'additional' section, identify any other important product attributes and their values (e.g., 'Color', 'Material', 'Weight' etc) that are present in the PRODUCT TEXT but not in the Mandatory Attribute list.
  225. # For each additional attribute, use the best available value from the PRODUCT TEXT and specify the 'source'.
  226. # """ if extract_additional else ""
  227. # prompt = f"""
  228. # You are a product-attribute classifier.
  229. # Understand the product text very deeply and see if same products is available somewhere online and then predict then attribute values.
  230. # Do not totally depend on the product text only word by word, try to interpret the meaning and then suggest attributes.
  231. # Pick the *closest meaning* value from the allowed list, even if not an exact word match.
  232. # ALLOWED VALUES (MANDATORY):
  233. # {allowed_text}
  234. # Return multiple values for these attributes: {multiple_text}
  235. # {additional_instructions}
  236. # PRODUCT TEXT:
  237. # {product_text}
  238. # OUTPUT (strict JSON only):
  239. # {{
  240. # "mandatory": {{
  241. # "<attr>": [{ {"value":"<chosen>", "source":"<{source_hint}>"} } ]
  242. # }},
  243. # "additional": {{
  244. # "Additional_Attr_1": [{{"value": "Value 1", "source": "<{source_hint}>"}}],
  245. # "Additional_Attr_2": [{{"value": "Value 2", "source": "<{source_hint}>"}}]
  246. # }}
  247. # }}
  248. # RULES:
  249. # - For 'additional' attributes: Extract other key attributes from the text.
  250. # - Source must be: {source_hint}
  251. # - Return ONLY JSON
  252. # """
  253. # # prompt = f"""
  254. # # You are a product-attribute classifier.
  255. # # Pick **exactly one** value from the list below for each attribute.
  256. # # If nothing matches, return "Not Specified".
  257. # # ALLOWED VALUES:
  258. # # {allowed_text}
  259. # # {multiple_text}
  260. # # PRODUCT TEXT:
  261. # # {product_text}
  262. # # OUTPUT (strict JSON only):
  263. # # {{
  264. # # "mandatory": {{
  265. # # "<attr>": [{{"value":"<chosen>", "source":"<{source_hint}>"}}]
  266. # # }},
  267. # # "additional": {{}}
  268. # # }}
  269. # # RULES:
  270. # # - Pick from allowed values only
  271. # # - If not found: "Not Specified" + "not_found"
  272. # # - Source must be: {source_hint}
  273. # # - Return ONLY JSON
  274. # # """
  275. # payload = {
  276. # "model": model,
  277. # "messages": [
  278. # {"role": "system", "content": "You are a JSON-only extractor."},
  279. # {"role": "user", "content": prompt},
  280. # ],
  281. # "temperature": 0.3,
  282. # "max_tokens": 1200,
  283. # }
  284. # try:
  285. # raw = ProductAttributeService._call_llm(payload)
  286. # print("raw from llm is ")
  287. # print(raw)
  288. # cleaned = ProductAttributeService._clean_json(raw)
  289. # parsed = json.loads(cleaned)
  290. # except Exception as exc:
  291. # logger.error(f"LLM failed: {exc}")
  292. # return {
  293. # "mandatory": {a: [{"value": "Not Specified", "source": "llm_error"}] for a in mandatory_attrs},
  294. # "additional": {} if not extract_additional else {},
  295. # "error": str(exc)
  296. # }
  297. # # --------------------------- VALIDATION + SMART RECOVERY ---------------------------
  298. # pt_emb = EmbeddingCache.get_embedding(product_text, model_embedder)
  299. # def _sanitize(section: dict, allowed: Dict):
  300. # sanitized = {}
  301. # for attr, items in section.items():
  302. # if attr not in allowed: continue
  303. # chosen = []
  304. # for it in (items if isinstance(items, list) else [items]):
  305. # if not isinstance(it, dict): it = {"value": str(it), "source": "not_found"}
  306. # val = str(it.get("value", "")).strip()
  307. # src = str(it.get("source", "not_found")).lower()
  308. # # --- LLM SAYS "Not Specified" → SMART RECOVERY ---
  309. # if val == "Not Specified":
  310. # # 1. Lexical recovery
  311. # for av in allowed[attr]:
  312. # if ProductAttributeService._lexical_evidence(product_text, av) > 0.6:
  313. # src = ProductAttributeService._find_source(av, source_map)
  314. # chosen.append({"value": av, "source": src})
  315. # break
  316. # else:
  317. # # 2. Semantic recovery
  318. # best_val, best_score = max(
  319. # ((av, float(util.cos_sim(pt_emb, EmbeddingCache.get_embedding(av, model_embedder)).item()))
  320. # for av in allowed[attr]),
  321. # key=lambda x: x[1]
  322. # )
  323. # if best_score > 0.75:
  324. # src = ProductAttributeService._find_source(best_val, source_map)
  325. # chosen.append({"value": best_val, "source": src})
  326. # else:
  327. # chosen.append({"value": "Not Specified", "source": "not_found"})
  328. # continue
  329. # # --- VALIDATE LLM CHOICE ---
  330. # if val not in allowed[attr]: continue
  331. # if ProductAttributeService._lexical_evidence(product_text, val) < 0.2: continue
  332. # if src not in source_map and src != "not_found": src = "not_found"
  333. # chosen.append({"value": val, "source": src})
  334. # sanitized[attr] = chosen or [{"value": "Not Specified", "source": "not_found"}]
  335. # return sanitized
  336. # parsed["mandatory"] = _sanitize(parsed.get("mandatory", {}), mandatory_attrs)
  337. # # --- ADDITIONAL ATTRIBUTES ---
  338. # if extract_additional and "additional" in parsed:
  339. # additional = {}
  340. # for attr, items in parsed["additional"].items():
  341. # good = []
  342. # for it in (items if isinstance(items, list) else [items]):
  343. # if not isinstance(it, dict): it = {"value": str(it), "source": "not_found"}
  344. # val = str(it.get("value", "")).strip()
  345. # src = str(it.get("source", "not_found")).lower()
  346. # if src not in source_map and src != "not_found": src = "not_found"
  347. # if val: good.append({"value": val, "source": src})
  348. # if good: additional[attr] = good
  349. # parsed["additional"] = additional
  350. # else:
  351. # parsed.pop("additional", None)
  352. # if use_cache and cache_key:
  353. # SimpleCache.set(cache_key, parsed)
  354. # logger.info(f"CACHE SET {cache_key[:16]}...")
  355. # return parsed
  356. # @staticmethod
  357. # def get_cache_stats() -> Dict:
  358. # return {
  359. # "global_enabled": is_caching_enabled(),
  360. # "result_cache": SimpleCache.get_stats(),
  361. # "embedding_cache": EmbeddingCache.get_stats(),
  362. # }
  363. # @staticmethod
  364. # def clear_all_caches():
  365. # SimpleCache.clear()
  366. # EmbeddingCache.clear()
  367. # logger.info("All caches cleared")
  368. # # ==================== DEAD CODE — revision v2 (LLM ONLY - NO VALIDATION), superseded by the live version below; safe to delete, history lives in version control ====================
  369. # import json
  370. # import hashlib
  371. # import logging
  372. # import time
  373. # from functools import wraps
  374. # from typing import Dict, List, Optional, Tuple
  375. # import requests
  376. # from django.conf import settings
  377. # # --------------------------------------------------------------------------- #
  378. # # CACHE CONFIG
  379. # # --------------------------------------------------------------------------- #
  380. # from .cache_config import (
  381. # is_caching_enabled,
  382. # ENABLE_ATTRIBUTE_EXTRACTION_CACHE,
  383. # ATTRIBUTE_CACHE_MAX_SIZE,
  384. # )
  385. # logger = logging.getLogger(__name__)
  386. # # --------------------------------------------------------------------------- #
  387. # # CACHES
  388. # # --------------------------------------------------------------------------- #
  389. # class SimpleCache:
  390. # _cache = {}
  391. # _max_size = ATTRIBUTE_CACHE_MAX_SIZE
  392. # @classmethod
  393. # def get(cls, key: str) -> Optional[Dict]:
  394. # if not ENABLE_ATTRIBUTE_EXTRACTION_CACHE: return None
  395. # return cls._cache.get(key)
  396. # @classmethod
  397. # def set(cls, key: str, value: Dict):
  398. # if not ENABLE_ATTRIBUTE_EXTRACTION_CACHE: return
  399. # if len(cls._cache) >= cls._max_size:
  400. # items = list(cls._cache.items())
  401. # cls._cache = dict(items[int(cls._max_size * 0.2):])
  402. # cls._cache[key] = value
  403. # @classmethod
  404. # def clear(cls): cls._cache.clear()
  405. # @classmethod
  406. # def get_stats(cls) -> Dict:
  407. # return {
  408. # "enabled": ENABLE_ATTRIBUTE_EXTRACTION_CACHE,
  409. # "size": len(cls._cache),
  410. # "max_size": cls._max_size,
  411. # "usage_percent": round(len(cls._cache)/cls._max_size*100, 2) if cls._max_size else 0
  412. # }
  413. # # --------------------------------------------------------------------------- #
  414. # # RETRY DECORATOR
  415. # # --------------------------------------------------------------------------- #
  416. # def retry(max_attempts=3, delay=1.0):
  417. # def decorator(f):
  418. # @wraps(f)
  419. # def wrapper(*args, **kwargs):
  420. # last_exc = None
  421. # for i in range(max_attempts):
  422. # try:
  423. # return f(*args, **kwargs)
  424. # except Exception as e:
  425. # last_exc = e
  426. # if i < max_attempts - 1:
  427. # wait = delay * (2 ** i)
  428. # logger.warning(f"Retry {i+1}/{max_attempts} after {wait}s: {e}")
  429. # time.sleep(wait)
  430. # raise last_exc or RuntimeError("Retry failed")
  431. # return wrapper
  432. # return decorator
  433. # # --------------------------------------------------------------------------- #
  434. # # MAIN SERVICE
  435. # # --------------------------------------------------------------------------- #
  436. # class ProductAttributeService:
  437. # @staticmethod
  438. # def combine_product_text(title=None, short_desc=None, long_desc=None, ocr_text=None) -> Tuple[str, Dict[str, str]]:
  439. # parts = []
  440. # source_map = {}
  441. # if title:
  442. # t = str(title).strip()
  443. # parts.append(f"Title: {t}")
  444. # source_map["title"] = t
  445. # if short_desc:
  446. # s = str(short_desc).strip()
  447. # parts.append(f"Description: {s}")
  448. # source_map["short_desc"] = s
  449. # if long_desc:
  450. # l = str(long_desc).strip()
  451. # parts.append(f"Details: {l}")
  452. # source_map["long_desc"] = l
  453. # if ocr_text:
  454. # parts.append(f"OCR Text: {ocr_text}")
  455. # source_map["ocr_text"] = ocr_text
  456. # combined = "\n".join(parts).strip()
  457. # return (combined or "No product information", source_map)
  458. # @staticmethod
  459. # def _cache_key(product_text: str, mandatory_attrs: Dict, extract_additional: bool, multiple: List[str]) -> str:
  460. # payload = {"text": product_text, "attrs": mandatory_attrs, "extra": extract_additional, "multiple": sorted(multiple)}
  461. # return f"attr_{hashlib.md5(json.dumps(payload, sort_keys=True).encode()).hexdigest()}"
  462. # @staticmethod
  463. # def _clean_json(text: str) -> str:
  464. # start = text.find("{")
  465. # end = text.rfind("}") + 1
  466. # if start != -1 and end > start:
  467. # text = text[start:end]
  468. # if "```json" in text:
  469. # text = text.split("```json", 1)[1].split("```", 1)[0]
  470. # elif "```" in text:
  471. # text = text.split("```", 1)[1].split("```", 1)[0]
  472. # if text.lstrip().startswith("json"): text = text[4:]
  473. # return text.strip()
  474. # @staticmethod
  475. # def format_visual_attributes(visual_attributes: Dict) -> Dict:
  476. # formatted = {}
  477. # for key, value in visual_attributes.items():
  478. # if isinstance(value, list):
  479. # formatted[key] = [{"value": str(item), "source": "image"} for item in value]
  480. # elif isinstance(value, dict):
  481. # nested = {}
  482. # for sub_key, sub_val in value.items():
  483. # if isinstance(sub_val, list):
  484. # nested[sub_key] = [{"value": str(v), "source": "image"} for v in sub_val]
  485. # else:
  486. # nested[sub_key] = [{"value": str(sub_val), "source": "image"}]
  487. # formatted[key] = nested
  488. # else:
  489. # formatted[key] = [{"value": str(value), "source": "image"}]
  490. # return formatted
  491. # @staticmethod
  492. # @retry(max_attempts=3, delay=1.0)
  493. # def _call_llm(payload: dict) -> str:
  494. # headers = {"Authorization": f"Bearer {settings.GROQ_API_KEY}", "Content-Type": "application/json"}
  495. # resp = requests.post(settings.GROQ_API_URL, headers=headers, json=payload, timeout=30)
  496. # resp.raise_for_status()
  497. # return resp.json()["choices"][0]["message"]["content"]
  498. # @staticmethod
  499. # def extract_attributes(
  500. # product_text: str,
  501. # mandatory_attrs: Dict[str, List[str]],
  502. # source_map: Dict[str, str] = None,
  503. # model: str = None,
  504. # extract_additional: bool = True,
  505. # multiple: Optional[List[str]] = None,
  506. # use_cache: Optional[bool] = None,
  507. # ) -> dict:
  508. # if model is None: model = settings.SUPPORTED_MODELS[0]
  509. # if multiple is None: multiple = []
  510. # if source_map is None: source_map = {}
  511. # if use_cache is None: use_cache = ENABLE_ATTRIBUTE_EXTRACTION_CACHE
  512. # if not is_caching_enabled(): use_cache = False
  513. # cache_key = None
  514. # if use_cache:
  515. # cache_key = ProductAttributeService._cache_key(product_text, mandatory_attrs, extract_additional, multiple)
  516. # cached = SimpleCache.get(cache_key)
  517. # if cached:
  518. # logger.info(f"CACHE HIT {cache_key[:16]}...")
  519. # return cached
  520. # # --------------------------- PROMPT ---------------------------
  521. # allowed_lines = [f"{attr}: {', '.join(vals)}" for attr, vals in mandatory_attrs.items()]
  522. # allowed_text = "\n".join(allowed_lines)
  523. # allowed_sources = list(source_map.keys()) + ["not_found"]
  524. # source_hint = "|".join(allowed_sources)
  525. # multiple_text = f"\nMULTIPLE ALLOWED FOR: {', '.join(multiple)}" if multiple else ""
  526. # additional_instructions = """
  527. # For the 'additional' section, identify any other important product attributes and their values (e.g., 'Color', 'Material', 'Weight' etc) that are present in the PRODUCT TEXT but not in the Mandatory Attribute list.
  528. # For each additional attribute, use the best available value from the PRODUCT TEXT and specify the 'source'.
  529. # """ if extract_additional else ""
  530. # prompt = f"""
  531. # You are a product-attribute classifier.
  532. # Understand the product text very deeply and see if same products is available somewhere online and then predict the attribute values.
  533. # Do not totally depend on the product text only word by word, try to interpret the meaning and then suggest attributes.
  534. # Pick the *closest meaning* value from the allowed list, even if not an exact word match.
  535. # I want values for all the mandatory attributes.
  536. # If it is not found anywhere then source should be "Infer"
  537. # ALLOWED VALUES (MANDATORY):
  538. # {allowed_text}
  539. # Return multiple values for these attributes: {multiple_text}
  540. # {additional_instructions}
  541. # PRODUCT TEXT:
  542. # {product_text}
  543. # OUTPUT (strict JSON only):
  544. # {{
  545. # "mandatory": {{
  546. # "<attr>": [{{"value":"<chosen>", "source":"<{source_hint}>"}}]
  547. # }},
  548. # "additional": {{
  549. # "Additional_Attr_1": [{{"value": "Value 1", "source": "<{source_hint}>"}}],
  550. # "Additional_Attr_2": [{{"value": "Value 2", "source": "<{source_hint}>"}}]
  551. # }}
  552. # }}
  553. # RULES:
  554. # - For 'additional' attributes: Extract other key attributes from the text.
  555. # - Source must be: {source_hint}
  556. # - Return ONLY JSON
  557. # """
  558. # payload = {
  559. # "model": model,
  560. # "messages": [
  561. # {"role": "system", "content": "You are a JSON-only extractor."},
  562. # {"role": "user", "content": prompt},
  563. # ],
  564. # "temperature": 0.3,
  565. # "max_tokens": 1200,
  566. # }
  567. # try:
  568. # raw = ProductAttributeService._call_llm(payload)
  569. # print("raw from llm is ")
  570. # print(raw)
  571. # cleaned = ProductAttributeService._clean_json(raw)
  572. # parsed = json.loads(cleaned)
  573. # except Exception as exc:
  574. # logger.error(f"LLM failed: {exc}")
  575. # return {
  576. # "mandatory": {a: [{"value": "Not Specified", "source": "llm_error"}] for a in mandatory_attrs},
  577. # "additional": {} if not extract_additional else {},
  578. # "error": str(exc)
  579. # }
  580. # # --------------------------- RETURN LLM RESULT AS-IS ---------------------------
  581. # # No validation, no semantic recovery, no lexical checks
  582. # # Just return whatever the LLM gave us
  583. # if use_cache and cache_key:
  584. # SimpleCache.set(cache_key, parsed)
  585. # logger.info(f"CACHE SET {cache_key[:16]}...")
  586. # return parsed
  587. # @staticmethod
  588. # def get_cache_stats() -> Dict:
  589. # return {
  590. # "global_enabled": is_caching_enabled(),
  591. # "result_cache": SimpleCache.get_stats(),
  592. # }
  593. # @staticmethod
  594. # def clear_all_caches():
  595. # SimpleCache.clear()
  596. # logger.info("All caches cleared")
  597. # ==================== services.py (WITH USER VALUE REASONING) ====================
  598. import json
  599. import hashlib
  600. import logging
  601. import time
  602. from functools import wraps
  603. from typing import Dict, List, Optional, Tuple
  604. import requests
  605. from django.conf import settings
  606. from .cache_config import (
  607. is_caching_enabled,
  608. ENABLE_ATTRIBUTE_EXTRACTION_CACHE,
  609. ATTRIBUTE_CACHE_MAX_SIZE,
  610. )
  611. logger = logging.getLogger(__name__)
  612. # --------------------------------------------------------------------------- #
  613. # CACHES
  614. # --------------------------------------------------------------------------- #
  615. class SimpleCache:
  616. _cache = {}
  617. _max_size = ATTRIBUTE_CACHE_MAX_SIZE
  618. @classmethod
  619. def get(cls, key: str) -> Optional[Dict]:
  620. if not ENABLE_ATTRIBUTE_EXTRACTION_CACHE: return None
  621. return cls._cache.get(key)
  622. @classmethod
  623. def set(cls, key: str, value: Dict):
  624. if not ENABLE_ATTRIBUTE_EXTRACTION_CACHE: return
  625. if len(cls._cache) >= cls._max_size:
  626. items = list(cls._cache.items())
  627. cls._cache = dict(items[int(cls._max_size * 0.2):])
  628. cls._cache[key] = value
  629. @classmethod
  630. def clear(cls): cls._cache.clear()
  631. @classmethod
  632. def get_stats(cls) -> Dict:
  633. return {
  634. "enabled": ENABLE_ATTRIBUTE_EXTRACTION_CACHE,
  635. "size": len(cls._cache),
  636. "max_size": cls._max_size,
  637. "usage_percent": round(len(cls._cache)/cls._max_size*100, 2) if cls._max_size else 0
  638. }
  639. # --------------------------------------------------------------------------- #
  640. # RETRY DECORATOR
  641. # --------------------------------------------------------------------------- #
  642. def retry(max_attempts=3, delay=1.0):
  643. def decorator(f):
  644. @wraps(f)
  645. def wrapper(*args, **kwargs):
  646. last_exc = None
  647. for i in range(max_attempts):
  648. try:
  649. return f(*args, **kwargs)
  650. except Exception as e:
  651. last_exc = e
  652. if i < max_attempts - 1:
  653. wait = delay * (2 ** i)
  654. logger.warning(f"Retry {i+1}/{max_attempts} after {wait}s: {e}")
  655. time.sleep(wait)
  656. raise last_exc or RuntimeError("Retry failed")
  657. return wrapper
  658. return decorator
  659. # --------------------------------------------------------------------------- #
  660. # MAIN SERVICE
  661. # --------------------------------------------------------------------------- #
  662. class ProductAttributeService:
  663. @staticmethod
  664. def combine_product_text(title=None, short_desc=None, long_desc=None, ocr_text=None) -> Tuple[str, Dict[str, str]]:
  665. parts = []
  666. source_map = {}
  667. if title:
  668. t = str(title).strip()
  669. parts.append(f"Title: {t}")
  670. source_map["title"] = t
  671. if short_desc:
  672. s = str(short_desc).strip()
  673. parts.append(f"Description: {s}")
  674. source_map["short_desc"] = s
  675. if long_desc:
  676. l = str(long_desc).strip()
  677. parts.append(f"Details: {l}")
  678. source_map["long_desc"] = l
  679. if ocr_text:
  680. parts.append(f"OCR Text: {ocr_text}")
  681. source_map["ocr_text"] = ocr_text
  682. combined = "\n".join(parts).strip()
  683. return (combined or "No product information", source_map)
  684. @staticmethod
  685. def _cache_key(product_text: str, mandatory_attrs: Dict, extract_additional: bool, multiple: List[str], user_values: Dict = None) -> str:
  686. payload = {
  687. "text": product_text,
  688. "attrs": mandatory_attrs,
  689. "extra": extract_additional,
  690. "multiple": sorted(multiple),
  691. "user_values": user_values or {}
  692. }
  693. return f"attr_{hashlib.md5(json.dumps(payload, sort_keys=True).encode()).hexdigest()}"
  694. @staticmethod
  695. def _clean_json(text: str) -> str:
  696. start = text.find("{")
  697. end = text.rfind("}") + 1
  698. if start != -1 and end > start:
  699. text = text[start:end]
  700. if "```json" in text:
  701. text = text.split("```json", 1)[1].split("```", 1)[0]
  702. elif "```" in text:
  703. text = text.split("```", 1)[1].split("```", 1)[0]
  704. if text.lstrip().startswith("json"): text = text[4:]
  705. return text.strip()
  706. @staticmethod
  707. def format_visual_attributes(visual_attributes: Dict) -> Dict:
  708. formatted = {}
  709. for key, value in visual_attributes.items():
  710. if isinstance(value, list):
  711. formatted[key] = [{"value": str(item), "source": "image"} for item in value]
  712. elif isinstance(value, dict):
  713. nested = {}
  714. for sub_key, sub_val in value.items():
  715. if isinstance(sub_val, list):
  716. nested[sub_key] = [{"value": str(v), "source": "image"} for v in sub_val]
  717. else:
  718. nested[sub_key] = [{"value": str(sub_val), "source": "image"}]
  719. formatted[key] = nested
  720. else:
  721. formatted[key] = [{"value": str(value), "source": "image"}]
  722. return formatted
  723. @staticmethod
  724. @retry(max_attempts=3, delay=1.0)
  725. def _call_llm(payload: dict) -> str:
  726. headers = {"Authorization": f"Bearer {settings.GROQ_API_KEY}", "Content-Type": "application/json"}
  727. resp = requests.post(settings.GROQ_API_URL, headers=headers, json=payload, timeout=30)
  728. resp.raise_for_status()
  729. return resp.json()["choices"][0]["message"]["content"]
  730. @staticmethod
  731. def extract_attributes(
  732. product_text: str,
  733. mandatory_attrs: Dict[str, List[str]],
  734. source_map: Dict[str, str] = None,
  735. model: str = None,
  736. extract_additional: bool = True,
  737. multiple: Optional[List[str]] = None,
  738. use_cache: Optional[bool] = None,
  739. user_entered_values: Optional[Dict[str, str]] = None, # NEW PARAMETER
  740. ) -> dict:
  741. if model is None: model = settings.SUPPORTED_MODELS[0]
  742. if multiple is None: multiple = []
  743. if source_map is None: source_map = {}
  744. if user_entered_values is None: user_entered_values = {}
  745. if use_cache is None: use_cache = ENABLE_ATTRIBUTE_EXTRACTION_CACHE
  746. if not is_caching_enabled(): use_cache = False
  747. cache_key = None
  748. if use_cache:
  749. cache_key = ProductAttributeService._cache_key(
  750. product_text, mandatory_attrs, extract_additional, multiple, user_entered_values
  751. )
  752. cached = SimpleCache.get(cache_key)
  753. if cached:
  754. logger.info(f"CACHE HIT {cache_key[:16]}...")
  755. return cached
  756. # --------------------------- BUILD USER VALUES SECTION ---------------------------
  757. user_values_section = ""
  758. if user_entered_values:
  759. user_lines = []
  760. for attr, value in user_entered_values.items():
  761. user_lines.append(f" - {attr}: {value}")
  762. user_values_section = f"""
  763. USER MANUALLY ENTERED VALUES:
  764. {chr(10).join(user_lines)}
  765. IMPORTANT INSTRUCTIONS FOR USER VALUES:
  766. 1. Compare the user-entered value with what you find in the product text
  767. 2. Evaluate if the user value is correct, partially correct, or incorrect for this product
  768. 3. Choose the BEST value (could be user's value, or from allowed list, or inferred)
  769. 4. Always provide a "reason" field explaining your decision
  770. 5. DO NOT hallucinate - be honest if user's value seems wrong based on product evidence
  771. 6. If user's value is not in the allowed list but seems correct, chose the most nearest value from the allowed list with proper reasoning.
  772. """
  773. # --------------------------- PROMPT ---------------------------
  774. allowed_lines = [f"{attr}: {', '.join(vals)}" for attr, vals in mandatory_attrs.items()]
  775. allowed_text = "\n".join(allowed_lines)
  776. allowed_sources = list(source_map.keys()) + ["title", "description", "inferred"]
  777. source_hint = "|".join(allowed_sources)
  778. multiple_text = f"\nMULTIPLE ALLOWED FOR: {', '.join(multiple)}" if multiple else ""
  779. print("Multiple text for attr: ")
  780. print(multiple_text)
  781. additional_instructions = """
  782. For the 'additional' section, identify any other important product attributes and their values (e.g., 'Color', 'Material', 'Weight' etc) that are present in the PRODUCT TEXT but not in the Mandatory Attribute list.
  783. For each additional attribute, use the best available value from the PRODUCT TEXT and specify the 'source'.
  784. """ if extract_additional else ""
  785. prompt = f"""
  786. You are a product-attribute classifier and validator.
  787. Understand the product text very deeply. If the same product is available somewhere online, use that knowledge to predict accurate attribute values.
  788. Do not depend only on word-by-word matching from the product text - interpret the meaning and suggest attributes intelligently.
  789. Pick the *closest meaning* value from the allowed list, even if not an exact word match.
  790. I want values for all mandatory attributes.
  791. If a value is not found anywhere, the source should be "inferred".
  792. Note: Source means from where you have concluded the result. Choose one of these value <{source_hint}>
  793. ALLOWED VALUES (MANDATORY):
  794. {allowed_text}
  795. Note: Always return multiple values for these attributes: {multiple_text}. These values must be most possible values from the list and should be max 2 values.
  796. {user_values_section}
  797. {additional_instructions}
  798. PRODUCT TEXT:
  799. {product_text}
  800. OUTPUT (strict JSON only):
  801. {{
  802. "mandatory": {{
  803. "<attr>": [{{
  804. "value": "<chosen_value>",
  805. "source": "<{source_hint}>",
  806. "reason": "Explanation of why this value was chosen. If user provided a value, explain why you agreed/disagreed with it.",
  807. "original_value": "<user_entered_value_if_provided>",
  808. "decision": "accepted|rejected"
  809. }}]
  810. }},
  811. "additional": {{
  812. "Additional_Attr_1": [{{
  813. "value": "Value 1",
  814. "source": "<{source_hint}>",
  815. "reason": "Why this attribute and value were identified"
  816. }}]
  817. }}
  818. }}
  819. In case of multiple values for these attributes: {multiple_text} json should be
  820. OUTPUT (strict JSON only):
  821. {{
  822. "mandatory": {{
  823. "<attr1>": [{{
  824. "value": "<chosen_value>",
  825. "source": "<{source_hint}>",
  826. "reason": "Explanation of why this value was chosen. If user provided a value, explain why you agreed/disagreed with it.",
  827. "original_value": "<user_entered_value_if_provided>",
  828. "decision": "accepted|rejected"
  829. }}],
  830. "<attr1>": [{{
  831. "value": "<chosen_value>",
  832. "source": "<{source_hint}>",
  833. "reason": "Explanation of why this value was chosen. If user provided a value, explain why you agreed/disagreed with it.",
  834. "original_value": "<user_entered_value_if_provided>",
  835. "decision": "accepted|rejected"
  836. }}]
  837. }},
  838. "additional": {{
  839. "Additional_Attr_1": [{{
  840. "value": "Value 1",
  841. "source": "<{source_hint}>",
  842. "reason": "Why this attribute and value were identified"
  843. }}]
  844. }}
  845. }}
  846. RULES:
  847. - For each mandatory attribute with a user-entered value, include "original_value" and "decision" fields
  848. - "decision" values: "accepted" (used user's value), "rejected" (used different value), "not_provided" (no user value given)
  849. - "reason" must explain your choice, especially when rejecting user input
  850. - For 'additional' attributes: Extract other key attributes from the text
  851. - Source must be one of: {source_hint}
  852. - Be honest and specific in your reasoning
  853. - Return ONLY valid JSON
  854. """
  855. payload = {
  856. "model": model,
  857. "messages": [
  858. {"role": "system", "content": "You are a JSON-only extractor and validator. Always provide clear reasoning for your decisions."},
  859. {"role": "user", "content": prompt},
  860. ],
  861. "temperature": 0.3,
  862. "max_tokens": 2000, # Increased for reasoning
  863. }
  864. try:
  865. raw = ProductAttributeService._call_llm(payload)
  866. logger.info("Raw LLM response received")
  867. print(raw)
  868. cleaned = ProductAttributeService._clean_json(raw)
  869. parsed = json.loads(cleaned)
  870. except Exception as exc:
  871. logger.error(f"LLM failed: {exc}")
  872. return {
  873. "mandatory": {
  874. a: [{
  875. "value": "Not Specified",
  876. "source": "llm_error",
  877. "reason": f"LLM processing failed: {str(exc)}"
  878. }] for a in mandatory_attrs
  879. },
  880. "additional": {} if not extract_additional else {},
  881. "error": str(exc)
  882. }
  883. if use_cache and cache_key:
  884. SimpleCache.set(cache_key, parsed)
  885. logger.info(f"CACHE SET {cache_key[:16]}...")
  886. return parsed
  887. @staticmethod
  888. def get_cache_stats() -> Dict:
  889. return {
  890. "global_enabled": is_caching_enabled(),
  891. "result_cache": SimpleCache.get_stats(),
  892. }
    @staticmethod
    def clear_all_caches():
        """Empty every process-local cache (currently just SimpleCache)."""
        SimpleCache.clear()
        logger.info("All caches cleared")