From 0cda45a45f494b68fe6e3ba7c67a53d50619f856 Mon Sep 17 00:00:00 2001 From: Xu Han Date: Fri, 17 Jan 2025 02:19:07 +0000 Subject: [PATCH 1/2] feat: support image understanding from html --- .../dist/llm_bot_dep-0.1.0-py3-none-any.whl | Bin 57540 -> 59019 bytes .../lambda/job/dep/llm_bot_dep/figure_llm.py | 155 ++++++++++++++---- .../job/dep/llm_bot_dep/loaders/docx.py | 16 -- .../job/dep/llm_bot_dep/loaders/html.py | 1 - 4 files changed, 123 insertions(+), 49 deletions(-) diff --git a/source/lambda/job/dep/dist/llm_bot_dep-0.1.0-py3-none-any.whl b/source/lambda/job/dep/dist/llm_bot_dep-0.1.0-py3-none-any.whl index a8b123fd7c2e193e66675b73b612bd66b0adcadc..e641f2ba1e9b63547bb2ae08d6ef1f122c8b8d94 100644 GIT binary patch delta 10144 zcmZvC1yCJJxAnnYgF|q4cPF?*g1ftOa1IhY=)v7daCd^cySoQ>2=aXwJ_rIx{ zn$_KF@2TC>)qC}>>Fb7y8i%STp+#lKVAC1rFxBJ2002tZz)Z9xpbi3C*SUqe=eI~g zHii1}N;_6Fg4mQ8Q)IhAYGS+Jc)K=x^Xm7^GiwElO;*%|C$8rp&Gm8^T4C_9VyVXxKz=2angs zc1J$f%Xfq*gSJD9pyJuWX`84}m#)|xNctS|6}$P5JpPVwl_!h@5U*Ytp{jYrkBTO` z7|{TH4BI_{9DD6)%z&X`e9!$4kY7d-d+@LZmVCr!bZ;p@l*Uo2D13rS^DhAeZbeeM z@w+~GpE$Pp=c1^CyU-q2(tagomT_0WZsqW@`P)ng`duIGfqpEvT!E#>)6^XmCdc9t zr#UO8IxoH654_)dM_=1=gxkEFE&a;co>n)4tKP2`U{Y7!9?#AfG%h;Ys*9VPe|Vg% zo_Vl%@Lcrr3SO+e`W_DBzLtSbJ6-)&5GR)3ysAKl&KFl)0p||uCk{W+oCHn`?;PKL z=fsLI6S2y?gVr5=F6@wZAOd>Cg58>gmo{4KZw`?eX6vb6c>nE5H_o z02}JWR7;SxF_1#r#gO8$a6#q62d&$2eSY6Bg2blRvexDuERU|?lx@0Ot&^ zpWg@XJ7S`_hg<0JK5&nlbGOlQd%BC}eX%B^oHA-2&p1zdeW{H^qHo zc^XO9x;luHy-z^DHQv!4nwODJ5s;65TXu}5pkJLuZ#0y>W_XC!>5*P`K9t$&l}3mg za>-oWADWFhN>>1(39!)^6jS;l&m8pEync&afEH-jBstT@%Z#Y4*mX(r?Kj^budq{% zu@-E8;!Y=hSKs1R3&;wz=|Ya1(MB~19IAFt~tiK9<)0#@tglwQXs_ed4VpGLiXwV{4g}o zaLdeP8iD1?WMsnTH^^LY7F<$1ST@6y0pi$orRnm1ndURsnBFWUGc(g7@JzQGB)NyR9LLNBsJCK@Egs=i;(w)w*|d7)F4B)cSe4RE=7jO|@x( z$294px!HFLcZ|?^#X-3Y(+-qsQ%n~EN1JB zwa*aP*<8JMOu7%cbF;3f?0CRj?5N@MkB-YC(u#}rwXZ3nWYL)0cvl$T`>v;^c}cId@67N#OXTbFn68- z3hQc@LI4oTdw>(XpVDV2C~vEZ7=|8)IvQ1Al=e9$fGo&<=`os-1IQ6a3qar_)E4^+ z-JJpESCJ%T4#&b8k?B$Jz9&Y%>ZK!SfMTfwb@galjCBwNNro|1gUm+y>G3}0v6qU{ ze8$EUxuBTmM+A4()UO5T+}n^VGY?`CN$|$2Nby31Pw7VoeguSkh}1AF;OD?-Y{M!c zN}+RC7qhv7bkgPQ5bRgU@5J<;G>4XtBV*KeH9$Tc6x2WH^#yuY;*;{O9U@Luh{lMki zb2$sGJnX9%GFB2^QM8fz6LPa?g?3sLEl`6X)3RnG%4qndi&El;v33L=UXGJq!;~W& z7m?h-W3f}Mto!45Uev&r&13VWSjq*K^DD8e{M;MYxb-dE1Ky znhrq7mTNqUCP*@Q2r zzIjgK1X)s%(;P>_V53#+nWf)Yr-ocuV906wIMP>Sf@z>w$sXwCPo@^x7he+{-y5w3 zet;>n785_iRqfIt6VoR&rTUGr{`}| zyOwLga{Z3mO9jjB>0+K|EAm8_CQ1_<%DO9h6~S^%8Ky(6Y-!o8<7O=v%_xK1x_&58 z47%^KxaQp9{WTa7-k+f!e9tK$avT1J$g4 z6CNb{{nhh^Zg1eHdLW)&`k}P5VX`VB_LF6%X22?*8F9WbmoXv6f-`FA2DG|##R#~a zK|5f)Ml!lc&lf%3yiJuirikstaY%9*TA`*bNl--Q>$g){y~~9vvHr^C zbMsNkcwf7(q!25rV5r2L>Q^ArD5$pYzBj&nJ!QzG4HzS}6XHHt!$r5zn|$&~CaEaF zWU59}ii}ZG-1%81Al)cA1e>amcpT@tX|D-t1JU6Aqh0q?p|w|8E-x+z7VJ0MG3ZiE zLnp#)kO7N{!o*IpPW=s#_-V2uOpCd&FUk{vu${+{rOiAR<+uAJQ2>*rjXQG(PRSH&* zuj@=wf3J9v=lXYAY{#u};8y;6jAHpt2C&YG;58jiLht<2NA8M$L(%kbRb{#>o(Djcg15XvM_# z;nEtpy$AM3GJLoxwtNc*G#y(I8zsy}C6BXDhuP6H}9M5!E8@a--9Z#;$B4n6|$574kH3W1l z`K>YK4Dkj9tVUTJJGH~)nF#v?Vov9u@?0ePk&(4j&+}_*nrdwKzL-?%F#~ZnA?l$H zrYO@q4S)|HdX?JIfuQ&OGblkwwn5m6@MlIbY&9=mjK_o+GxE07Fbp=iC0exu)K5}6 zFMXKFDyekgkFD zatN#MiUfpNHO*lamr*c>ijt%5&N%{73514AsR3IBFF>9wP=qe^i*m2$)Li0I4*=S+8%_qLwa1?-F?gk3nwmP?7)ack)+RysY_mg`uZ z0y~6Aikxl>As@Q8V6v{yvzOfWJA@t?Wfi4#_nT`*y+OCYI*MgO*tu1M=p=NzqW}oH z4olZLHgN}eh>F<&ecM98!a<=uvEZRws&8o~8#YN=5dDIoB?%O;7%#nuHuK5Q-Ul$n z#l_3{60mqmh`6TN;~%-Zw%NXf24_aJE$LM?=@KPet5p`knf;(t%*=cD(ejKbIkX&$ zNgIbJ8w6p0rGslnyX0P}gz4W!e3OH^p~vFo)~wS&uM8q3mGrdUaG}7F#+f5mb+(t& z)bu$XYZrd6<3K{%WXRU$E$DT1-cH&{BQLqoF*e(|1BD=ueoZhG8r}@cW!-8{4#@aE z;9ns@MqX=}b(D{UysVL1LYq=38cF4n`+Z~#u?rOC9E6;Q(a5MwN4PuoX@ZAz-s%`` z9-BR@>$mAoej$I~nQuZPLCILnFK}|@zj|`pWQCsY2gTt zo`(>o%0`v67liSf?mXpTC2;a1j#?YBBrB%#$P}8c*!m7l3^Lf|*`(Db8N4u8RYZI~ zFZ}}1G_53Kfz2qG(-tV_l^QQKL?Xt%aG@QtXq02U9WslJ^BH8|u&;CnWGd3^d*gAW zDW%p_9Q9H;SvCekC7?&7GltdR535m`L;5G*VDh5h}Irh&~0P37i zM6nII#WFkB?}l|NTyPSg5 zoxCbO4?C=~qIWzq!0(eoMI2EDt^^I1*F2+kRwc;@WC~5dnAfUO>rl`H~$qCL>w|_jdaLVueBc>FwCfmIBGA7`pF6 zUJYY?(>_?VqviGK0-kInMsdi?_#S2bzp{~--n3i~C82bPx~`^28l2!nY1w#(Y^}eJ zB$u5D+2Lk{nRIBgXhuEvw{AEPH=64M%0JyzFZnK)kDA{va3esJEe9}B(4xMo?g%Ri zBto;dWC3Vyo_0asri0Jd2(Ma;}@}P1M4&eq7mP zr}Vb(T6bZ4PfX$?6B@VDC&ot-&YG+wLq}4Qhhk(7=tk)Fv_>h*nm-*T_~!O==6s3x z6O6+c({r8Rc|$Ppcrok_9ee^AA8!UJQ$cJ0S!!X*Oqc`&QMh<_IL4o>mzvWhf>ea4 zN{pv-|GbE{fDnRX3ZVZ!AbRaK4%A1GA#T(2%EzcLR+UCR6_WA5yP;0Z(G}rChtG1U z_;9{9i%Zp|W2x)(A<0w&d7`a(o%ZRpWJ;w2)jWGR>9@+_b$tP3Q#qt0;mUL&@j(uQ zxik&NO#BN7$@)E#G3hQ&iAIyh7Lom{#lob%c+{!SL!>u)c6Z^rhG%OgkHTu~_sq~r zRXxZ}T@*pAZlkSQ!5l~ydzdv{Y!mEBL--uE>vP)=7iCS|NnJ@(l2nfwOr9`InsSFB z{h6oQ?*&69ISk!2?;RKjH!LdhP|#R^RvA2Cl{qCI1*PuFGZF#y>`&cqtHxO(9R>hU zMFW#W5P-}aO+EiCy(~>#$5n1D|HUfO&(L5*O|K(jSSaEt3)=*;vmov+R%jS``TB7w zOs)i*U-(ZK0%5qa^_fDS(Hdp^5B;~}t@wW+|JVmItk{#Usr|ZKc*vLImg6oo2_>9% z+EMOV$pVg@+SH(iHC4O8S!_b=R)Z+&wc|ttC_rYTzbIv_wPuGzRfn&Ay56Zre{NTh zDS#bFG*C6`fV$_YR!F&1QuHw_sQ#b)gSyUVb!sCS^vlzTk$sRZcz~PWc@%wl35E@9 zh)xd0kC(VY@Q#&6hkW$w^F4T#R*s)4pw@kgiRqt~44eyFcQ?mrVqJ+GsvIG2Ta9(I zDM5Htl|%Gj%a&YRO&{3hbQhfDZYX6((yhuaw)YJghuhi_ItE2?Dk%P3Larse1MZ|wth2(J(K3W+#;?&U8C?$U)J+U^gocCV)f-QlxcH=SSzY2jxY12^=T3V}tvJs(n(1q%inCpL7>Gk_tU3 zI=4qBlF5zjCArnasd{qAGm`|IP&JIUa2CVA$R<+~?k5P0t&k#OPL>MHTX=%Rg!6sR zi%udug{aYv^13nN;D4SYb_&iwU4s@#ur>(tWErBz)A0x%feJMhChMqtgMIx@#fSBe z$cHcl-)jZEln5`TNM;8x-oJCFGB0WSCX_#42NJHb>W{ay6EE<2-IaCxIF}Fc0FpG7USb)>pA zIjQ~j+4rpR7y)w=*o8$HS_M+8SJyT9DLsrp3|v@>oJE$qtOqtquAGSj+s9pqq=`=M zHZDI5(-NE-E>a9bXH8BOkWbMH8~p5FYHZ)r;RyGFbywN(CFj|@?=9-Y!nWlq6u9kw z%_KXJs$(KJ%GQ$9MpW1=jB{?X_HT8`?2*}{u(O`|P!17PIU@BAwFwe&0Cg%@p1%$$ zR;^6`X#6}S)qHG>^<9NWu)Wm>^dX*J+Eb>1K95^U75uwepV*DK%@1c&v)~-~e)O~2 z%xi2MA2>#pEoH}P3cpmf@Iw)i(ImrRl){HoiKUG72$n{jf-WXuS+t2%HKYPEuuT7P zRn5TFR!lF-i->-YU;xA&cY;Oxv!*pxeHEoXfH&KuZmL8>buqH&GdE;RSXOl1kG1c! zO6jNHq}h&Bx82>^lwrHN?V=RFt7FCZrG+!8##E#2K;Mr^ya4b zQ{U_pgqh$I?HE|X?mR^s7^2La>-#3afD{vYolB+{tfJE^6z=?B^?)yp!rJA1XA!$> zeKidF&!)bM&Zg5`#}jA&M@$rhi6f-ZF?am}_oTj{yg#50?+Y{Xf!!l$K^N>bfl@CR zB2ZOej;GYbXYoR~l6KXL_8peYd7VOcpy5K&>bO=-w1)zgs@i~DN$*c5-*$?DYF9fmqR$U|?#4o;W zu!WYg=W(TAL5(C1K20rcf}DEWx89GQrC8tgTVk^ePf2~A|dPZ?d=`k>0O5^B;Q4}3F*4L)Xb|dH;q=MMpHGq6G`Ig zD+g%f=6Vq6&4^;D;1G>q-TwUAuF%{HFfn3R3cyA{&vph|NRlJDfiij^YHY^b%Pdc+ zaT=L649xKDc(d`{3bqhfVURNe`>m0daqvdQafwf?M5k*XWPTa=qRQJ*g5jiX1= zuv(#Gp-W>t2C_n?@SE7)CJN1TeD{mZrb!Kx=-~K=6Fym~;xH0U_3ma5GleFb#8*TG zd@e7o4Gxk0lf8O{~;}+ zsk}z$z1hI`_oHpwv3-05AmV3YK77YmgCZ|M!^%>a7M0(3UeGVon?Z8}W2`*N6iXEL zPrEsXW+uQ30SWWF4(GyOiZU{XPqe&eXLrwK{awYZFpV5*2l6WRSKW(R(gL=v4tP}wO_6XvM6dj$_n4Eq{AJUq3Q3WL49jQeHjw7urcUdnHF$wJ&H5)l+l_bdHbiMHcb-P3P+BE%72 z$w+-LR&TYeEMBYVzKy5MPLq29FFZ@8BX1V&Li}8Q`ujt5xc5nQZ1ig^4-2Y>$F0Sl z%@vOq_LWc5TUV7>dxwgE8<8aZuUWiCmzy&5C5my_YN&=xTMMb-(+DjpjurTgL24<+ z<`3!1(LJDUEzr4B3>pl7C>oNw3Q5iQZ=)e)BSE01wtAyMJzmnP4kitiZQ1n@S_jjM z&E5fBAGroVXuOI;;DxR z$jyj+g+s&+Il;pSC9#pGo4(;!za?011LXu|h4qP;gkq4)1#+i>xa9Nj=sQ65wn_&L zkw>(@8wuEMjR4H)g;lp79Yzkd!pB_aSiObt4^V5B`0x)%9e04l(8grbp?RaCM1%kU zRQ?=bD8Yu)B;e{zc#!>6i$fnNI_%bl@-{mip7WBA>}t^%nR(G|c{iI5at!|H)hgVA zT;vw!_@KyOl++JOx$C?^qjrHLBCRWWA3yqvv*CFPRd3q25iGo^9et3Z#ay6ZT7atD z!?N2<^YylGF{#n}V~$=!=2Sfx#h;*bkKTyL6L#=%4gxCP74&^4hglnz=u`IbbW{VQ z4KFeK7Tj5(cGwP_5k6jWlMHjpp;>b{cy`Qa%nF$~f{6R@Xx~%aA6TrvVDPQmLE^6C zUEi{4n%2PSGr@=Nra45&NWcw@{frB)5PMR~tK?G17+s{YMjoYD^f94e^$Gj}cUTEM zL|J|#6Q5j_ePZJQ9g$7xJd8ev{(q+QzXkW-`N}I|I|pk_YeP3qaZH2jrn#a1^ny>P z)lmPAo+!P_Cj61$EQnx*89GqXaAlbmrsUxhlM{&_#}EW8^1v;g$A?d@N6179rq!Lo zEpqFam6jdpbLf;Br{NtG-%I(!x6mnbU$LPj)se+6=Y`{i(Gb#~GEAtJt>u;0J;?}& zfOiN*To~U_ogBIB2`YfC^k(PLx+L9ir5e@f_uaf$Upn(!0_2t#cgsP0BOD$rg@r7h zIE{^a88t((Pa}&7<9RSG{8BHG1)OseaWh${}hB z=85k7I|wF2L;P)>d#*K%7xRK}&M@;IP9DDda%i%3(q%Ly z`skVDn}o26#N2F;_TMb6bOy$(+74C2cw(|p4~uTjhOeu>efXq26D~2DBT+SkPE@kw zVLC%jD1gpbr6Du3j6o~m$PruPfF|IzP3hNHw5SMBxDT+xa{2*kuiG1TMb9*HenCOd zow_AG-4XN*@C?wuyqGNotPn>;IsK$LJed=5OKSPndNec-dztv!ZrmjBgFnP8jv!n` zaFo>%=J)R=vN)SD17uk=y8^|8+1_jQM-!HBDr+P=@MoM}sp5@QzN4Z}$z~fJ7ptXs zuv4$0Unch;F09BvSz7n!_$(Xsvn6ATsTI~lX*Jo5+1S2zsnMNy1KVo?z+<5?e~+!>euiuv>rr!ni#$?7_-@x6 zU6x-|a<;)iE%xC$tjQAfR(0oU&WmXQCsrUN7>=igOb@1mo`+zb|l z=NZPOO1Gw>DKcTVV`dp;7`X<-3$>L?0ZTM$!zeg` z)vuJ<(6pg$q?`)F&S1OGdeDgP@wx(E`)VdXRP>%FcUdzv#$*Ed^=7^Hkj6(DG{fd- ztaJaz?;;^h_dx91Ol$a>(J>VQnnIv{C14+G$s25s{@!- z>F%JlKC?cu)BHOM7r>3T*fySu$t|)ASxdssfz{dRRngPH`NjpuhlZM7zQLX)r)d8c zg)z&U!&1Yr+eEN=ZXGh>EG_!9PUZ*#r$ytqfS>y17o9!~ho?EA9`pqB4fn>6dijs8 zkGu4z&n8c#M3GcTy@D@T%gXDY*E{TbZv{Y`AD*qyu;RdBNZ*3-nM`Y@`)A0Qf!Sop z-McxFbGc=`X1(4CsLUnvByBT-J|LsSviub8TV}1C@NkAA&7qN-)>yTi4+)xPSxJz z$T*c>;{DXjwcnNRsiz1$T?;5#{0=UE?k@1L-S14i&d0!ysY zLvn%swnA3fAO{2gqKQ>Z(tnqH006>&*~f(^tkIGF>okG%zd5Ece|;IQu|Xn$|Dx|} z?2vbUy2(~W<` z1=wqY5VE-eY}|?sF593Y{i8s1N+VXL5dIWL+4%v0e+Qb;`G-jMC(Hh44*wW{NjC{0 zw`cwg5rMTg@k#$bsq(L2)#kxQ)5L$23)Vlxzd2g(58>}k_Ag@g!XF|9%=x-Id=&O)7i=ZvTMR)yHZ9ws@7_`906+yDl=Uqoq>jYeaa!u3 z1=6dA7NBEAM;qT{YG|W`#O>N@{9eE@V}1JV+!A&yoq8N~Q@WXlj`-W4?%LYc#9$?@ zlkym+!g++a7)r-PT;Xf>UzbPa`3c8Z9&HceEXOC8FYQsPWlegb^ypa>3t3fnX**I1 zW?u8(GOZhws?uez4jv|<5NR%x@iLrgBy)XLM(o2$PkI5q-v~A$BH__=^LZp0jFVdW zO;MD7EDV zr1e_vyi+pQYuS%(yW}*^U)Eb^cg!Tpt4;}47X~(#6;NvNioCcVyO?H4X;BJeva`8v z!uWp5l1eR^=T&F7^7a(gd5E)pYISM4M^mE+yc7Y7y15It-mn9fb)IXTPXo@N!)4=% z+Ul89ULu_pwpMx&2)B82K|sK}yN0^P%KJP1?=w10fn?t86TceQ9+s63!$UTPpl4Ge zsfXcZf+B&K^F1xYY4b#^qMp|WbA1=hb(R$w=>bW* zHPLCz>UO7t)>t@h@YVXkWd&JU;R=4X25ogaqYS)?bG$Sybrh{@4;R#~X&5P;e&yb{ zY&xPpU6XnsCF)wdgk9A-)Mva;kE-RfKeG|JaaO%05LP@COAz%xpK~bJeXv^CJ$Rby zyFMFhOQvZa8N#>2bxN5XpKLhw3AqD}aDp7^=8^Y5O$^vk4*DAFli0!Lj~!9yHNwZY zU=uiH`CM)GVTR%wy>)cEo5`t)oji|)GE7Ydl{-5^el%V9I6e=}b%>ZhU%k6^!y+hM zXm(}A0L#C_0P|yk)op`N$BAsS09F`Qi5F%4m|vbQ56dD2^^ai)jF>OWw1=~|!IZny z^Ue2UPq2%P9F)TDVJd7HhzQ-bV+|)Ok;q^8w zHmt0W99?3f7%SnHp03=1moggM1@EAN?P6Tb&Dn?*o8N0g?opW04VItOWtktInc*J6D_4G?eqeo)C%5q-$ zs#${ZMEryU5}+Qwit&nN?*oji&?~7xopI-ne*C3gc2W4UksZZhYtb^G2qa zdx*hC7?r=wra+Ui9HD2T*cg1o_(~cjfo7G)4zS-n$D4&+9=0>FW(_rjD zdJVEOyYasb79gmYgGXJ^m7OhTKDXS#?GGqdQ%U|T-AQ-;{=;Tjwa*443_>T(C-qcl zVp*6iVNFUD`acxhq8kl#V})??g0tZ~6yhW{W1s{LTbF zR&}AstV_uyp;>J3lM`i2;SV-0Y1hD9OUoB8);r>l@{ZB;Ad5!D>sEeJRD9tO@KPCU z*MV!#mZMl0pI0iH5FaOT$tYedHC3vwmTUxb=R+G$A!H=exeg*o-;+ zZ8j&5K2KSkR35a1+h8OeX{;SFu*SuYsW53tex$d!^HZQte+>PF<6csB*tc%K&)N+W zUMQV%?lz-LVrZn!(A~#NtY8fE3y61{_oL_I!|F)Z3hzY8&rT@Y&mbhV8ojdn19@=x zhyC?B;q8xwhOuP6vXJ)MNUzs+o83|pE)M|q$sM4k<8~BwWen-o4|mciNtx)bSz)at zAY)+>S;u1D{wdL|P5PN60zdh%xjS9D67_^0tZ9eGjIJV`PWZlTaqn_Be>pgtDA7QH8kn({pe`LRvh8VX>))p^_z*b zYK&LX%4>+O9 zw1|gRUo8D3lU{jStVzcj_4b+Zv(nt*!B^eK5}T9RNlhm{l~$-Q$sjU+$0o6A;w`Gh zm;>!nJ{Q83S?xqYN{UPm9wbFtCNkXEwaF8E27^gZKwj{k}h3Wy^|blT>_L$WjUf0<`;H4(jWk- zNY9+Lr2Cx|;d1LMn@8s?hiUcz2QExnSKrtkK6G?KR7Qt6w<8}uu>2= zeiZ1HXogi8GCw}~ZU?ESv(LV@>M`P9(K)9d zP}k+4-w1b;7KQ`u2CjT2Aap2U`7mDT8rrdd>9dT=5@?aP-_HZ_*a4FG%RM4`im%`>hyaEdkh2;pKOd0*Zj3}DZ{ zVFS1a(qLpIJ4zL29cyRiO@!-)$803WxlBdms0*9G4Vg=mf%?wWBp4 z#ZI)xEL??C7S9}lwlnkl-m*hcESv6VXg)&s4*5uG9wYY^Jz?qHm%+S9p}Ch+fDJcD z%v<8c*v9pFV-kENuHSjgP>IVbTz>>}aT^RpNcZg_57{uz+U?`=v#>P%VSxonD4|PI zNW+)S6-dRRny-o`Ga~`3qbXoWi3+VD+|3+$SCjAa`@_#=)DUGG@+WwTh#EGuL%+kg z;*Ami4XYzXB96>S+@kkSmMOc9dowIIY9?N)2`i_&uIiU9qfaLdLA5$~+lf18{M#kB z=ryrf%NTN*%0n`93PiICYskz>W2DA)w1_KXOeqL( zqDaVez0!UlD^s<{i#uHJ(RS-JzD?*I?#`##yehuS2_&7LpIp&=!xp?3?`#4sl=~HX zE5+|qu0z=p9g2gUjO>)yT{E$(T6^~6rThbjKNhhanNU6~sSjx)t|$1Ch}y)#0vbyF zMi4T~?lQnXpve8+ZmilG^Z_0^6EXR-dRWHoeOE7NSD?a>Z|+MBImSr`3ooj*m{!z| zq5F3|aYAm%(r#|JUmI<5OdrF1$RmQ==Uj5gBO6a5xIy~J`wH+F~di46ERj~WGX@!Uc8z$yxw^>yJ-dOS=rm(IB{Q|&_MF> zHW#^Mr!|}#3&+LKxx9)cF!b0{( zVG^^sgXE<%1e|2E#Z^ZP3C>CilgB05;zwh>e|Vnd%@|bXpJ<%UcCmiqgD2;~uJ{l#|PU?8nzAQ!v zMkvE9sGLQ*F2|oCxbM16SIn(FDo|z;FHWCpFtDGVtDeu2b882QVkgOs5llMFBBhwyM}|8 zzcj>TLTXF1_mw;x znHH4~fP%q(i!v^MyBiov*$mtDxNRXpZphv5B>;U!{TUFiW^(n(zRx}Fbn=~W;G5HB zN8?lzd#wKJ(y*P*yU98at2e2P=?NZmavz$=U!?G1c|tNWrAu;aYmiLb{l-N!1mh&G z;%I3v6dsyiJ8Jy+5yfLsuxhX%VaOOWU^XoqrfGFu1OR{x8(L9D0$tfagw&~P+b)Cg zeC8{tRN$bf+9WZ;cnCsNqjR+4Q}EWl21qE0iRNjMv|nlQ`%y2}Trt?27se=1K4F5{ z*P6b_gcZjQ#VD1}=jus!%N1a}X*&$IW)3qiFqT9yZN$cB@L?EPvFRBp#2c=3M^$bx za6unHI-Gx>#(1B{`F^f60^%PZrka3;7c`Oud{Iwnz^_0zNab$faBPg5J@o34$N&a` zmF&s*?kMiDfi3bM2T{tMnvI*ej$k7yj++^nnVaK-30gnE`WO|1*2lH6^mMz2ELEBr zgE)02=R%bbn@I~PP@2lv`IT1A87_Ob$U>G&4gqb&3#rnRq!FLOAm#F!(`U^xn^td_ zGwX)y!PfK_E6Qm*)$1vVCgJjPi>lUcnqiBJBxu3cYJWaM%{ms18(lUoMUG5~&w-1b z_?m&WGV{N5mg@qiQdQHH0(a=9mzL2^oaq_b#_2CZI|=(a$8m!g8v7|MpU7CZ`rvteG3w3aW{vLnITU#sy=J|w0u ztmBiHlyQ)vGTnvM707w^ltk@`GFhVY&7S?5(DM`FB`d-Y1Ah&HyelQ7I3s6&`Z)x2 z>T#^XY2828;2enIZk(_HJYdU8vZ57y=$F29;34 zmY~eweYWjMQcCoF8jwBCfePnmU*nrIX)wNQjP=i9f-NWWb6zaIgU=58jASYSSO$Cu zPVL%c0?YP%+`ts_@|%;fk7Hj+mLv?lBk%bropP!s_hiH#5LEU|Ea@2a2BpcYtTJ%b z2#x9J;`yH#A>EqoFnjv$h&Ed!RZN5Fv~4;LA)2HTVw;-IAxlnMyxUwudwR_jq4VAs zz7!;6FyGb>*{-MGy`!>ymO3cIPMw)rC?yt$T?=NcS1Vnl+T*leI{q!L_0;@~hpsY6 z>mf&aIzc(zhy7XvPG(!s^onCye)1z-=FlB&YCYM-g@7J=DFSY(Grvf?8@Ci5zxEFo zZU%Qv4pRkWZIv5u-ZyF#{ygsbJ#4_$(^1}`8qEwh2^d6 zTM-8nG6=gg4{`q{deb_OdXc+!c!&19@nn;lr&V#HRXT&37h<41uEF51#eJ6~6xF8%l5AEy#^?O=8nYmA3~ zQ&Ey>?3+oJ?J1f4*{l7|x2nv2tj-kc67GPiA2J-R6T7TH_@by%Iu~;?LFgKS#>rdk ztR;5)lS$T4--AHBZ>0~qm9T{;MMG)u5tq^c9m`2|>XGv9#piKZ(rbNn{8w%9NygWY z@YaD)igFj7OWSi189V*OqoyJ&o$Y@p67AP5p21T8;G9u?e=#WY^$~OxW#AeqLJK=O9*~Uy1IWwz8gs zWmh8s!%yeUs!(7YN^ zXiBUft!MREm>7WQs%`Z?GERe(Xjmdk9x}MxfT@juSgmlZW#gyd_L$EhmoAb1wWm1v zJroh?D4#=X!viPl$yq4_k0FVHm*I=os(2)Rb?!m{0As><0ZM;+dHaRcMxTB?9duiz z9ZA0JcZ#6Rad~*>qEGj3Y{PC^*|{8!qkRiUpYOx@SF)z;T~UG)s}&dyPYU;S6UdDa z#~EE6VvJ*pu-y4#0Ct^bN(2zOyAYPR{rwH-2gJ?Exq9ECa7EI-D(w2V&~PRqjb61` z%=u=XU9xeA5C1^Q zJ0iPN&T3!W%fLX?q1TMFP*kU zyuYZDlXvEBbw;_>qiOcI+Wfe3ef+@Kf=P{0RR9bz(1A9P@?hK5e@(VcBB3+MF~+X zSfDL$zila1^w(RA!5v{1M(AYJTZo9MA#CK`8y^Hc2D1xBM%N&jTY2}qRrlU6d?j@< zhez$D%nfN(hNL2(&k`SqnRR7t2%N`IkIPnh*QC)((@p6&Pd=j)s*Kg zC`7KvRk5-hWDIL{p&L$SmTbFJ*|r+SUqOC0zrx^DNkA@Z$|q7R6dDWZdw~R#?!v!H zukm__=~oZYjd*$FvCg}y5h!U@rYj4e_*0D~E7|E9v0ad6n6ZUchMPHI1747*cS$L1 z2q;e|sPKu^6`A3JLpo3jOvo)GK<=TylGh)l)TZzQ&4A~DE;@7x7?ZA?PhzLC1H*wO zs8k5_TW}h&fDK|^3pLtmQAh*h5q3$RPJ>6EjiJHv=CMR#SMXMf7%xgDK)JJ%SR=hY z#@a9Em)AyB6apTrXLSBa?_T9kDNvJZem%FinMBUVGnwpzyUE5HEP*X>-&#uir|^nj z@hl|UC7>Nq2F*Am)**irSP!=)&Zgc8C6;Pu?3gfB)%pgGZ!*ekkq}H+=H9wKm7+pE zjZY0SZYc3#@6Bb!duyD4R^N*(8c{_b3-2OFSr3pCzd`tup~qS^3{IUEE>0PF(&QP_aw8D0##ksq^!NebEjslin&O_21GrLm^nn}Fr8i{mFDpi z+KPR4^{+=@mRqm;dxv-6Zi^PbaOfCc-<^PK8= z4|BErY;0MWvQyN+D&ukCK!eJl+rY&QPOtAKF*Qd<2RROyURvDIh(IOK?2C#)&1GMG z1M#(-Q5v!zBNK$Xu*mX6Fz0GYVdGj-kg(l^&rgnA#NN81T*@$E>2$TUT6a;yh{}cL zI#`~v6}5?7GH$=vP{XLNNK-2O(JC2Jl0_i=l0Z|)y zJ$TDzJ!-~wFrV~=T;_F@r}XGKm>TM6x(LehaFw5yl2rvKu2nYbs0-&FFDWDOv$dH&}n*a_h$y+aYk@Z;0 zm>8Kwlf{r3nH9lxJZ61mPpq9&!R%tx5eKI+p_5jUnMkQoVY$$`7DqNQW7}rlz1Bky zm)ijnF5}na06?a9^64Z3?&7Qzt#V66^X%MC5-K8mf0-Sxhr}aM%YJ_jRSAGv%%)DX z;2Tj&rlA-)D)a9JnQ>TYUhX5S4qBn=KU`@oHC91=*SeNR&7R+6_7b(!7r}l!rvJA zjwP%p+MMY{Yd>~uE1`4^mk}@`RqyE1lKY9rqE+keeJ9?5lh-g_WdHYn`JY^Q*~bh) zE9W%fIA^%#py=~p*gsLI)V$K)Og{8UeHP#o(1n(>v^Qnu&p~fgPaO#*gvOpB*^Ea)}P)BCwH|HB6Z8 zK07acJ8htQ-zsmns|-$b%rk%Qevs-P0_!Z!lE=`u(#w}=vewpo5K>8^rq_@xkQ+N@ zQpm_>D5hH=ih51VtiI_IPmC%SEObHj2Xk#DG@3gBPt>sMPSxn$-4s5=1D!CL$d9z;<@Ib93P3 zS*MdF2tOGB-kNYoIz!`%pe25JmyBm>gy=tCg8zHttei&kb7nw67wr<=vAupmbMTP; zFIP^HNFPXjRw^SeXROg3Zf z-`aOzAzIMh_V)@h1-q=OQN)1buVtY3y*1{(?ucihAzb|;?Vxk@uUelA3WhG1xc()1 z=0G7!ZHrN4DbEA(CD_ROz4C_^b)*knC7nX*s6Ejp?4wQsnU; zkV`rz`U zw9Y)rZB?p=dgR>C~T(0z~%yEFPqy z7m4TM8Pb~&Npgu0J}m3FD%fCVO^kxkt8V2}{VxO(*6IhG1nEPO;`0#!vbkf^su-(Sn#!vuFjB6h>KVO9o6_kz>Ni*uTt$oRAW!*z_FV>1$3X3rrvig9cF zo*}|4=kjhwl^>0)0asU22d4aXSJ3$V$YL zWBtZMn`h$VK7c!Bny?}&0G{2j3ADW_;qH=YKt>lQ!m8w7j%ILi?n&RSv&jg_+uI0> zGv#t8UG9~KhgI0)Xms=0$(qi& zg;|g13gC1zyp6BKl8wjv3s9aQ1fzv%PA~QBqj@1!rxrdn6@scfFPD4x_EVWBZ&+HS zK?Ip>_}cBUr{rX|X?=CGxoCq!kbOA%mqqdedQp9#c^mU2+B11;v8E zL08sMfb(w9&kzFW>pBmR!2ORThbnI{0gF8UMiVx0sQz;`2>>AdpYHx#YyIr~|AH$U zxPQU_G5g)9{})93_7|kzt?{f#8zAvb_=!^M7aQk2`>hZj%BHE1|mE0{>V8XzHI^$!6&D z6d81S8=LA+NDCa*GQ#r5Al&?!lra3q%Y={q0U4pJJ7`q@-%b3Rto3^u|8H-B1cm%J z{7*U6`NRBkQvaL%lOl=9zwqD=_TLJ-vO@}7`&Wo)m-z2M|5 str: - """Process images in markdown content and upload them to S3. +def download_image_from_url(img_url: str) -> str: + """Download image from URL and save to temporary file. - Args: - content (str): The markdown content containing images to process - bucket_name (str): The S3 bucket where images will be uploaded - file_name (str): The file name for organizing uploads + Returns: + str: Path to temporary file containing the image + """ + response = requests.get(img_url, timeout=10) + response.raise_for_status() + + content_type = response.headers.get("Content-Type", "") + ext = mimetypes.guess_extension(content_type) or ".jpg" + if ext == ".jpe": + ext = ".jpg" + temp_file = tempfile.NamedTemporaryFile(suffix=ext, delete=False) + temp_file.write(response.content) + temp_file.close() + return temp_file.name + + +def process_single_image( + img_path: str, context: str, image_tag: str, bucket_name: str, file_name: str, idx: int +) -> str: + """Process a single image and return its understanding text. + + Args: + img_path (str): Path to the image file + context (str): Surrounding text context for the image + image_tag (str): Tag to identify the image in the context + bucket_name (str): S3 bucket name for uploading + file_name (str): Base file name for S3 path + idx (int): Index number of the image + Returns: - str: Processed markdown with updated image references + str: The processed understanding text for the image, or None if image is too small + + Raises: + Various exceptions during image processing and upload """ + with Image.open(img_path) as img: + width, height = img.size + if width < MIN_WIDTH or height < MIN_HEIGHT: + logger.warning(f"Image {idx} is too small ({width}x{height}). Skipping processing.") + return None + + image_base64 = encode_image_to_base64(img_path) figure_llm = figureUnderstand() + # Get image understanding + understanding = figure_llm.figure_understand(image_base64, context, image_tag, s3_link=f"{idx:05d}.jpg") + + # Update S3 link + updated_s3_link = upload_image_to_s3(img_path, bucket_name, file_name, "image", idx) + understanding = understanding.replace(f"{idx:05d}.jpg", f"{updated_s3_link}") + + return understanding + + +def process_markdown_images_with_llm(content: str, bucket_name: str, file_name: str) -> str: + """Process all images in markdown content and upload them to S3. + + This function: + 1. Finds all markdown image references in the content + 2. Downloads images if they are URLs + 3. Processes each image with LLM + 4. Uploads images to S3 + 5. Replaces image references with processed understanding + + Args: + content (str): The markdown content containing images + bucket_name (str): S3 bucket name for uploading + file_name (str): Base file name for S3 path + + Returns: + str: Updated content with processed image understandings + """ # Regular expression to find markdown image syntax: ![alt text](image_path) image_pattern = r"!\[([^\]]*)\]\(([^)]+)\)" - - # Keep track of where we last ended to maintain the full text last_end = 0 result = "" for idx, match in enumerate(re.finditer(image_pattern, content), 1): - # Generate unique identifier for this image - image_tag = f"[IMAGE_{idx:05d}]" - - # Get the full image match and its position start, end = match.span() - img_path = match.group(2) # Get the image path from the markdown syntax + img_path = match.group(2) + image_tag = f"[IMAGE_{idx:05d}]" # Add the text before the image result += content[last_end:start] - # Get context (200 characters before and after) - context_start = max(0, start - 200) - context_end = min(len(content), end + 200) - context = f"{content[context_start:start]}\n\n{image_tag}\n\n{content[end:context_end]}" - try: - # Convert image to base64 - image_base64 = encode_image_to_base64(img_path) - - # Get image understanding - understanding = figure_llm.figure_understand(image_base64, context, image_tag, s3_link=f"{idx:05d}.jpg") - - updated_s3_link = upload_image_to_s3(img_path, bucket_name, file_name, "image", idx) - understanding = understanding.replace(f"{idx:05d}.jpg", f"{updated_s3_link}") - - # Add the understanding text - result += f"\n\n{understanding}\n\n" + # Handle URL images + if img_path.startswith(("http://", "https://")): + try: + img_path = download_image_from_url(img_path) + except Exception as e: + logger.error(f"Error downloading image from URL {img_path}: {e}") + result += match.group(0) + last_end = end + continue + + # Get context + context_start = max(0, start - 200) + context_end = min(len(content), end + 200) + context = f"{content[context_start:start]}\n\n{image_tag}\n\n{content[end:context_end]}" + + # Process the image + understanding = process_single_image(img_path, context, image_tag, bucket_name, file_name, idx) + + if understanding: + result += f"\n\n{understanding}\n\n" + else: + result += match.group(0) except Exception as e: logger.error(f"Error processing image {idx}: {e}") - # If there's an error, keep the original markdown image syntax result += match.group(0) last_end = end # Add any remaining text after the last image result += content[last_end:] - return result diff --git a/source/lambda/job/dep/llm_bot_dep/loaders/docx.py b/source/lambda/job/dep/llm_bot_dep/loaders/docx.py index 3c505f37..250db7c9 100644 --- a/source/lambda/job/dep/llm_bot_dep/loaders/docx.py +++ b/source/lambda/job/dep/llm_bot_dep/loaders/docx.py @@ -1,6 +1,5 @@ import logging import os -import sys import uuid from datetime import datetime from pathlib import Path @@ -14,9 +13,6 @@ from llm_bot_dep.splitter_utils import MarkdownHeaderTextSplitter from PIL import Image -# sys.path.append("/home/ubuntu/icyxu/code/solutions/Intelli-Agent/source/lambda/job/dep") - - logger = logging.getLogger(__name__) @@ -118,15 +114,3 @@ def process_doc(s3, **kwargs): doc_list = splitter.split_text(doc) return doc_list - - -# if __name__ == "__main__": -# import boto3 - -# s3 = boto3.client("s3") -# kwargs = { -# "res_bucket": "ai-customer-service-sharedconstructaicustomerservi-wywyift3c084", -# "bucket": "ai-customer-service-sharedconstructaicustomerservi-wywyift3c084", -# "key": "workshop/CATS.docx", -# } -# process_doc(s3, **kwargs) diff --git a/source/lambda/job/dep/llm_bot_dep/loaders/html.py b/source/lambda/job/dep/llm_bot_dep/loaders/html.py index 0c2c3a9c..0240ac08 100644 --- a/source/lambda/job/dep/llm_bot_dep/loaders/html.py +++ b/source/lambda/job/dep/llm_bot_dep/loaders/html.py @@ -67,7 +67,6 @@ def load(self, file_content: str, bucket_name: str, file_name: str): html_content = self.clean_html(file_content) file_content = markdownify.markdownify(html_content, heading_style="ATX") file_content = process_markdown_images_with_llm(file_content, bucket_name, file_name) - print(file_content) doc = Document( page_content=file_content, metadata={"file_type": "html", "file_path": self.aws_path}, From 126b788db808aef0861df82d9de469ccc19db20a Mon Sep 17 00:00:00 2001 From: Xu Han Date: Fri, 17 Jan 2025 03:05:29 +0000 Subject: [PATCH 2/2] fix: add proper error handling for image understanding --- .../dist/llm_bot_dep-0.1.0-py3-none-any.whl | Bin 59019 -> 59005 bytes .../lambda/job/dep/llm_bot_dep/figure_llm.py | 24 +++++++++--------- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/source/lambda/job/dep/dist/llm_bot_dep-0.1.0-py3-none-any.whl b/source/lambda/job/dep/dist/llm_bot_dep-0.1.0-py3-none-any.whl index e641f2ba1e9b63547bb2ae08d6ef1f122c8b8d94..8ed890ae5fc787a108ff9a18f95c8eabf02532db 100644 GIT binary patch delta 6101 zcmZ9QRaDds*TtEkL6i>Z8ahV0TT)s|x(0j@kPhWfcgY|i-JQ}Q%^(dT%?v{~NC_y& z=lgtd@tvFfTYK%Rb8+@sCvF1sejKx&k&`H#8fun+a3JO)!N3TF^Ku3P?YrJnhqbdV zw+hOXHObup<>((`!|6IOr8W3e?V6LgrBb(O4kW?om2fI%&>gw%E%}UN)k)h~pp?4> zYzA*{YDGAn;HuY+bS2ga>ua)Nf$SZo@Gvi4*zPabi(nGrD}Ul4&J^ypEMJ69$B zyjaT|HC&O&s2f)M-H8YAcqfqR1Tl{an6W3m0a-~IT zTt4{XOErHf%75khzVrTM`()=mb<}Q7XWpswZ6R=#bnjcM^9tybSdlxuI+Z?=>BgBx z)&wFau0gJ_`>H<^Ru30v!x5M``#rzkoaQ7bNMq=VJ0+jY@!(5)U-g7I2n(Gjjr@bYfUeZh)6n)m z+)Jgmo&saO2Y_pL6#09<_TQJT59x>ZYioyZWcR*Yr=P&3?q6A5oyhDXQH$M^Kfz54 zwa3!PMgL!Vr)N1CgIJhKZIq@7+HRYg1mXASI#wcX>4U9RGCq2;E%^km= zK--g7iX}qT{bbUPUn!M3M90)!B}gIQ+RX(xGd<1G1g2#Fend?)4AnZ$&)%851Y@mllby)gIk(IAl-NtHIQiys!MGLmb zQ8h5zb3wdExg6NlaF5104=_2XdDwq-FSoksDAY?vJ$*G5kS@Tyu|oUhALHL*r@@Zx z@@U``FkWagB#Y>Nos2r38p&8oG&{x<=4LR;_zKBKjn|O zypL{q|C?~o*g8_rr0UjE*|<_WaU$Yjiy_m($Y&ARl%#UC(-Xof;lSu5RkjkedUg4B^K$ zNdf7M(B&Y^diH74(Uz07z@H0p$I@T0ml+&jE*G{BMMT6Nh{)Ryhii9~N0=jRUJci1 z=96<2HdFrfQp6vQ)=h`Ie2@&1qX!c|6hBp7Yc*b8m`wcwKR0{S7O;1 z)e63JmAd!rpqfMG5b2B=`s&q;E*$5@SOd|cFtY~t>VUh;3UyE0*YA8<0;(mUwTnAF zohnG@Cd}9K<~)ybIt&^GNoi&@uzsscL3SWzuu{Y>EpA^3$isdoo z+>*uSg(_wJ$Zi4F9mL(3V}BD-4*+jJCj+d7%IrWC~&+SjUUTDS}qp^{t}o zIw%vF2(cmJ{Uv_Ojn2W!D@EfMq3rQw-Aw1|Qn~CmxSuO#sD;^%iJh1X3}&B)fTs8) z3NRu|#uADCVcIl3e-k-z&g*;iVqBVuF?pJ-cCLkshPknOSMTeGcUc|mk=mSo6l_{q z?9Ua#0)zn!Z{jL!sU|RPo=uh+I{Mw<6DDV8kasUsD1JIQ{w>QJGmcVxmP9RsCl#** zJSK+XPJ>CU%0%(D=?-giO=iQ z{kp=KA%_NP^0a>TRptyj5NoD}2=y8)RbRBh;cJKM!3klim!6I-@ z#T$-ZYdWbm6y6tWBE@Iz!N&X+;$#}bED7Ng#IBX+XByk=NJC)WM3kUm(oW&{EO!|6 zMfRjjeF(k})?^2B7)3hjZKs#?s$EB^Qg9e790tcA|JHF4UUsGH|xPRF|W8w z$hJQJa|LRG563t+N-sfY7qa>_Ma1}YN@GOj4s{TZt#h8fhA=Wb_As*Gz12DDMUbxe z2d8b3N+Gzlc?susn^py!;Dk~DgGyy~3^2fA58x>%GcPp}g?@?F=YB&_KZyk{pBsGh zZu4p`I&x0z(stfHe%kDLC|NLRY>%@F#c`Ane7Jez!W4~(*rA7 z#zM^0o;Vm)4;9LbO@TRBUfk^Z)XDR)frrCvCCg=r&}mGR;v(+l)S9EL+sQ*I=Y8v8 zQISs=-qm!yv-T2~fW*mW6Rt}f13rxu2Z8&%sD7*}BIa^I5<(rK&iK@*C+Vt>w<;Q) zYcfb~R0Xgylm_a)WVSxfcv795BvJL_D2thL%U%BMv2>!n16>ch5n~`+!&Um3vb6QH zIX!WTJL3(-+GPr^BZc@?M)OyYYovFg|tUm>O^qPGC!dmQ`4`jLMnWugdVv~u(1gRc)2+h@35fs zV>Kd9k_?IET^k-p8Y3l*@gJnhJR+HK2l?TBitCaP_jJej5WT{yelGx*;3L~{=d^^$ z41z6)<0%mXiC2S_3bHjc5O^Y}w_urRwOKAj7@()w$a-E+&ck)QooVLkodUDObVda$ z3<>R*r0D#(P3V37tj9{-z=*{orPf3jIjy1XD3KRwdgDr2VE-W&HD{=Qu2Ko zZL7>Yn$}0?_H#8-x$m`EvhJ=5dA*UE?gGL@B1l=R`#Nl6krA#y1+d-6-#-a;ww24lW5_gKUr-QW%+QMCUAVio<7wKm=$&wA})!~flqbaojnbuQU zWa_gLl7_in-zIZ0ENbQA;8u6l=A0qgWEes?E|pZNL3T<GXxiPiW@NAW}rzSOuh zt?0lPAG~oRqXuuN$A;g6iXvZ(6f_k0ja~LJRKKu{U;eBMd;>nbp40sa2}=G!moxi6WPEz9&>+Vu11O}X@S*Jb1!lyaKqyaTc2V6m0ewMyc^APQDBEzulR zHDTvpQ6VdQ-W+|nz-!#}tll8wdv#$YrA305Dr=v>Z%jWjj^9sAv*`xp1B07!ET>Mx zztgZpt<^%EfHC|E@ViGpaxaVQWU;V(M8smR{2mSZH@truAfQUzn2!^uz_CrV#woFb zI|!4Zi4|A0ZC(097qy4K!B>_{`&%2^*4;UmaY)9b#i0DF+-4Wp-Y(hi@7)M@Q8pQt zC=VdhqHpHbIj^+`i4IVTq8Tf|m-Q%Lt4_%gntCfj$9W4)K+xHXaL7iO> z;pnu+(C^O}yCc1b7+a*an~}*~)l6KcdHkn&e5Z9i=10`9S44nodsmrLZc!B-j5m35 z+RThvFd4#tLmDedXR(?f7S2s~+Ah7gQk98@73k`b7`|D$`G~Lbm2K{JlLj|~}Y%qC7 zikT&CLF~Xs8{y!5;z|~ixEBl-Ll%RRjcFP9H^MlUHerUESc46*C95={sTn96^FLh! z;`pMrQ|h+`t+N9@^l=c%rZ)%QqKy(wi67#ec@ek0F@V!IXA*ly<2iG=;(*b#6!j)u z8=d~Jti1sX5{UQPuBVlG#P|`Hpw$wSgj6a$3)T6LYISmqzZtEoN zRD7Bw00ykSC1`C?&I+)d+oaoXjZ) zU1ndi-s#&;IfwEdHEQ({_vNjG;r$|NhT0WjQUcf26hWWRT@_BP{3rhmt9cX=YT@I_ zj=H(cN`yzFcWW?mY_i*}gAajNZ%Ot~OeTGSg|V`T_#(C6wGieyKYg?ezcg(nE~K=Hn)U ze`{uJBC%ZT;i|>hjvbR$(psM2bcyawd_1tCCx0kxy+F-XxA(TrZ<^_u1-@NqMwA)5 zkv!CGy<#(30qeLv)U263B-cNL_tHvOXL8s$5m{K-9-`;s z*go)=kJJbqjJfTJ4B7Sb393EHn0|C2F)XJgfqWm_I^)B~NoxFKfht3;Aip6da}Ee< zBTx)BM3?xFC77(5=*B4w_xvcrnaZz`l0BV zLeh7^L!o}vMLx#K>c^}gG0nNAsvT@;;nVTCWu&}jn3v{}X*8tpa9N+c`FT{NU=3oJ zI4{os`0G)kL`s0AGkh`f*HTf6kzD)B=2fV=;Knzn7bnXjl#2%(AKEK-FmHi43Z96E zc1>N+#D&s`cppJLn1O0L1465m%+ZDCmwOB?Cl@?yU|?wVZ`0H3_&S^Kleeq)eCDAR zh$-OlCOiDj>cdqB4EId;oJ?ySoSrs$-`Qh&xw6BR|8#}hwYmhe zgO)Ipm))knk{a|n9C7`J??)sZr6rR*lX9A_wGs)xdanvr$mQWg>S24b zstC#t|K+>)9aHxniT!9_#wX$Ji8q68tD54h ziLv$y4yZ^v`vjMbuH(TPXzSqU=?y;-d+qn@NlWhmb zOWVW*MJV_DMEMk6;MY8T6+^zm!$*=@hu`ao=sH@JVhLW&h|WzeSsAtohML@L9~Jl5 zcW%_e{)>a?bf{&%GmTKuaV`fvO58`%GHtRIHg-Y75uIC15|t(v6~&_Do^899pdROP zBwycU#;j8UOHYRZ7sV~BW~qWM%N*1uQfGQ2RH0Gl(=va6)Mu3;Xsm&HH?_skQd}FV z-9F4g%%jkrJR|JspUUPDV>0bG1`pFOcO(xAS&eCX4SrrkB2S7B$anN!Gbx_&4s=oM z9a=@oS3(0t*ktMxMAavsjA^dNeR-z%7KmnTn%%M?Sn58nR)K0hJ4Ic;dHD!)FQ~7Q z>O9)FDOsBr2$Z81KsC*ljW1->IMaP4wv3;P7qKQMIys(_8SR1> zbc*L#z@HDFo_IGqKK3Z5^p^UuD;!gq{n{V3EFHtkxkBeu)I(JPLMbVT5{K@}wpxDI zOr6KU_FN%uw&sm;nsG#yY}5*Y7G_YC)XRrwx(M}XX8+Q(&C9x8KxvFYl*h67B+I69 z^3qN|T`lTz+@5v~Z(50A{&VeO@FIu)2uv{&AZEK-y3QN-zP|ARxVv0tZ9A)q@YJR8 zbsz1^J7uJ7LYB(g@W@fDtv+}$c6l6TQAoa!Su+=UiDR$JS^q7<<7DI7#}#_hr5zqh z%d2w3@}rSa^T?M6yRJdnL0QE-Y%@ErkH-DI$eg%oYl!p?>l>J|v(u7XX*DlJ4x3_t zk!Ynwl|$7njGe2{erW#S9?h2G27F`3{`lyu#2;~jIu#;oZrEfW?q76I2x`|Fhh6Me zT18!E!yWP)9}&!Q63>`rkKzkEwoFL9H~8_=Y>>p&|GP{-af$=VyM^Ao@aFx^Cil^A z+gm2OSa!kz>3391I!Ht0lcmUgUz7rHR3xJ-ugy>=_}H;2e&{)sW>njktO%ZeP@_w*2k+k*%mUz3_QK!7NcVjv#+)rQ;OslTpZpID9j&`)t*%w-4i> zg#fpnQZ`~4!l&EQG&OJTK3|RwsWBALTy6{vD6o6kftMG~TE-W65H9cR##ufvQ7JJ#){K%+JCGwA>Q zo*CCnz}^z@90@LX;TkvS3w&Zt8QkXt-*;k!@2(NUFMWs!G5%+oLYCncXmyYWybLW5 zJ_v&A0hI6uv1&#gLQ6jXUMfB{EO?GP;hYlzt~-m3x{rT zgR0@?Tc+To9{Bw?I{56C0C?`(U*v-GZ9n=?KS^!VfuGO8^>!HG@pE`^;J@93zi!I> zzwXF3$$xUI+jJm4xb6-koO*|b@&6hP8v{dt00V>Ne|PX#n?Rv|b0RqU$p4uC1)Ijh AXaE2J delta 6132 zcmZXYRan(go5kVKa7by8?(S|xS`d`(@PKre@IN%taOiFnq@+ubLw9$#(j5}x=gi#9 z?3=aLZ@o8r-@MPWABT~mMv$thS+M!=c@0MREsX^65fDmY94vmoR*u`T&x%I|r*uSr z0nwx`E)nI^?(O+6`+dLjg)0i|KD++8lBuGJk5OT+opHG!wp_YJ`A#b*%7rRW*KH{2X7)myJUmi8>Qw~Tk02t5S09$~3_Fg1bI%iD-TRKEeN%tvDLumT_OENS+_;5+qyBlkr<1kj zHvebCH(!C)J1W=9jh*AE)<@II8vv+kyvw^;HEq#2zI5`RX}u}(^0WLSR)}I#WxV9m zQgd-xxVk0I8-;-y=FIt>JbSe_m8FwC)ob>I(N7fFu>Jh_x z9>xi8y#KcF<4@k7%Nw@)r$-I*>YLMp)q6NZqcY0lbkhIw;CT0amftC_J%KmM_q?#| zE2n!eD$tsoUz~6Q^>SxBRf_KcaIf=>d(*T4KEwd;-$UAmQt{4?Vpi+RA2YpVYK>@4 zJAj*VJEOACa;AbxC)ayc!%m77p2*}xxa{hw-H}I*#>+nUvM005yb1#8BjsjHHhhNE z1rBRZpfjR0bHdq=KS(B$zi6)uX+g6CKX!uS#ismDCKY8wHdhhZu* zLzv&ZhwPR)WvS(I2~=brtc*0aXjC3QJOW2g!ltfK;UhuBe{zlDa4AZusD-X*?4A6l zTOT6Mt|_3LfumXt9?z?_9Z$A9PGE4KSqzSTya%i$reLKdedUv!nfzOB%$>du|T(m`8F%xz4$XncZ#4m^6-PL2Z@ zC?LHbhx64q-*Xt(K9F*bx$3TUA0oMeKAU&dFA6czTg~1`VEQCFX*YI{V=z_)CE3H^R61BN+U5E)|&PU9F}vBkHMOw33*2~A<&pCX5OqHRH zA=n^2%bd+lj%4x=*P%1tZZwk-!{=n@ebrZlvfD}2k7DIxG&c!F=)jln#odynSHs&x zyCq|UK@ksE2nK}mO|W^EI@O?_NWQ%Su$|OyQwdeOx0vC0@i;?Kg=XpRVxhD_4)eD_ z3@8UhA4%;YzXqtOqiIa0u5ILA>p7>mzwW272X+~mAoE_yB6r|(*H2rP8p;S z&RK0W*uzHpDxa@RhWQ;4f%FOej2I@Yv!-qtYHb1bPANeG;C6*zxe0*65Nt)Q|>}596S?Lm$M7bR(Qf3@Kd?VtDHS_0;8jI_9k|USLZi^E?uWn$QlEmj6#P?BPxnl(|&-~iWgZXWN-s@$g?0dt5Igaz(Xg4 z{8w}R&!nWe&PMg)PH2LZD!aEO&fk?C`uMbYCi`i zDE7z>1sW*fTI+o4DUfL#3u2{HzaBd>`OA=F(R$o9{mDByW$m3muAA{TBqa=eC|5;on6(4{Y3zhF4*GH2lnZkNqx00tj>v2PrsPc z3l~dalHQS9mKoU|`tJDxrQB9l?ug`Vrva_3F@+`LJ^sq=DsH4DApfAC${8n2qFH$C z-LTWrB9p*`b)mSHMvAunpRE3q(E`S_jxPY^rLnfh#zNcsXM5}CA68M;c-TBC`@&SP_}+Q_oSbuN*W1*Ov=Da6&$0jXES%zZrLRAv={ za~I-t9%22!hOi@e(N(Stff}ie`EfrD7ctOyF2+KV3hAtUUJAQCl|DjWGq@)Yc0TKr zM0tU}5MF2FMRnSj0;LDp4x%E2q8Xa@E`OAu8D=S=M2tmL zBgE&8+l@%8m?qaR?SI}C+4_{{iIDIUqDI>dBbO1FI#c8TCfpW|@B4I(m!7B2pFa3> z7_S0yhbc}d&DK(%un2Oym>B4^r!G>;PI=6e~h7Sv|%~3`RlhVm=3gLMfCr#GL3Lb-ga=^O;aLvgx}V zII2#fvb=jYNaaZjZi66C`%a;nI8&k}vNhtR4JYr5Q(JIb7wXp(^ax8L)nb8LYwWh)TT~ixc@nQ7)*(FXXUPEL_s(-dBD8ilf_t}x`` zk}Li?k-~I74Z2?V;3=G~5n&UsiaE~Ps}Dvkf1`r;%G8ojjNOjXw$vtm!R<|XsJNPt zolUbox=K@Ch(bcXj^`K<%7M9@q<7Hian{1=aGdXWR(dbPU}~Pl7Af^dK14{sWLEwiO02j z%z0ILZRc%U$j32A(MhZUnNK2$U(V39G3c^|tPpmHZ&F~X3%JjQzWBM0l6`)hGw-p} zE_utLtp3_?r>SPh_tLYLe!&!VddVav8P9$niU_c_L)@l$MMhKR{pb}o_B2N9%MK+eFUAhk0^u{tqu27Xt5HdXAAR5 z1qrxdBNP$Rt<}M+45Fb?@U~rbr6*P-o~C>2;-I3d>-TH8P3pOp9}8=ZJx5=p5box( zk-V8sS9)S#Znb#@go1}c5=|us*2436)|*nGnO}MXD&%SDzME$67hr)Gbn;4BQj26F z8C~0#%+0`pM!x^2A7i|H=#BoY6j|eyw<@YUalz;P zmIgwFiivz$wZ=2v?mY{WOg>d!MXfP*AA%)KX}@>#znL2sQwU*J^jzt)3^Q*UPIUn3 z&%^Hpard}&z7am{amkK|nq(65E&haNsWb2RlJcj&Nvo;Yhci0n6{+FiK=cw0LVtJc2d{w*#mL z%b3gqp!3AI6+WaCJS$w8$kn_A#*)|2insHSkN#-kCPn(~Nv)b`sY)bPBq^3PZ7fR z9R`}6; zW8Alvd}7&j;G|oOzia1IFm=U^4ZruuWU*&V5 zx-dc@-1@7t*QXs7T;*EDa^_NFyezsk3RHC}A>LbuXo!tL@32kl-rI2D*le7=;)|n! z^Q!0&Dg<7ymn#eFsy{JV?i)DhIK zp*DohU)zA%?EyEuEHjq}Yz)KkOPa$?32&%3)cEvdstjR~>T{Iy59YmtX=(T5=IECF z{u$KMq{lY%M)9v=p+52C5tmiABRoA%DDE0>WNeEbM0FT?r6P_?` z0iQIvhN@3PGR`ShtL-OCWu&O%k74h}wh>QkXtM#m>tizR)w-$D;kmR5TgvpB9F82K zPxfgs9i+V*%i^BDB!>gMhDQ)R%U;h$9{W*TX6S1POLsWrO6l+1%vYC)^{iyh4oS-K zb&I<_lAI=JKl)K(~PWcSBs zuNh4sZ$qS$=oImCZ#j`gcNgVYFsKY8m8CbWJRUJ6`8c-KZAOm}1F-! zgxR-Akir&KtOhwESnKCM&5}P^$9wfsyjygcu3bA}v5TZY+pfu8wz}V;j{?U{->9hX z2d-g>@OX(pz-m|CxFS86vP@wG=YdtdfUVRZr%_0Q4r@>dZP9xrY8v4rpN0bcr(3Qe z_isiOyWpT^hX@0n6!|)v+G8!3xpe4(4S)qcy7Fe(V(5;Scst~BU)A|6GejlC{-NkN z)4c4>rKJpzsuX@XY?WD#pJzff`@M1=I?t>%0FevBW4KzlaoTP1`_w8;9ME7}v(oc6 zgdqo+CCr^hKyAPUX8+Cz8S^>8P&}l&X6!`;{5Yl4ma`!?%Tvr~%4ZvEWQbiie452Z z+qjw5;W*cl#u_evaLjhqo|y|LsYk z@L2A)>ZUy`xT1GOxbT(TAl0K@5Qp4I|?I+NbXgy73+FrX{=Z60KJ~8eSR|+`8Rv zu^j?`EbeG1BN?&a5)Xt68Y}Ns+U?<&;%hJd*x(Yz!@{wmgUL88YbJUoX}LUeXu&_X zawDhn%Hec(_oLmoBz~V~?0~XFtLZDPX%+$ZmcSW=-W{EpL!T|#1uH6u>%$Sn*drn( zIM1=%8NqHt`-{UQs6FL+AOmZZoJgWN`8Qy#?5CqS=Kf6u637RcitKncahj5?;eNpS zqY@b)6U7wT0luSA>XN+;MBCc0{+gYu@HX?Bmk3+ahddP1-y{0MZG=*dL-J7TNrhHG z;~~Lc%Ub_g3J(+z9-*D{HM3+jJf$En6}FrB{tAyr7`OUL+*;u+)zdsGTLZNKR=0<3DC zvet1W1@ZsVThYbAPm?) zzzsu(*>19duG0R+@=ab)LDoNexyc9G%ll`gwgf>5g?}w2EMbcclv?qRwpXCR7XQZR z>VJ`Pn-|nn4?CMAg00k(!Th(`K$#tXBPFb7n;Wz-`PZ_-sCU5sSyG%klpw7cSmi7g z3_633qp6ArB19tmdogr`zb){$&6)k%(!)Ccdw%pU5uN)tkGV@m^Z)kS|GvpztwW5R SgXPRIz#zMLsAenwY5ogzPNlg3 diff --git a/source/lambda/job/dep/llm_bot_dep/figure_llm.py b/source/lambda/job/dep/llm_bot_dep/figure_llm.py index 5d21dcad..f30b2574 100644 --- a/source/lambda/job/dep/llm_bot_dep/figure_llm.py +++ b/source/lambda/job/dep/llm_bot_dep/figure_llm.py @@ -60,7 +60,7 @@ class figureUnderstand: """A class to understand and process figures using LLM. - + This class provides methods to analyze images using Claude 3 Sonnet model, classify them, and generate appropriate descriptions or representations. """ @@ -72,13 +72,13 @@ def __init__(self): def invoke_llm(self, img, prompt, prefix="", stop=""): """Invoke the LLM model with an image and prompt. - + Args: img: Either a base64 encoded string or PIL Image object prompt (str): The prompt to send to the model prefix (str): Starting tag for the output stop (str): Ending tag for the output - + Returns: str: The model's response with prefix and stop tags """ @@ -227,7 +227,7 @@ def process_single_image( img_path: str, context: str, image_tag: str, bucket_name: str, file_name: str, idx: int ) -> str: """Process a single image and return its understanding text. - + Args: img_path (str): Path to the image file context (str): Surrounding text context for the image @@ -235,10 +235,10 @@ def process_single_image( bucket_name (str): S3 bucket name for uploading file_name (str): Base file name for S3 path idx (int): Index number of the image - + Returns: str: The processed understanding text for the image, or None if image is too small - + Raises: Various exceptions during image processing and upload """ @@ -263,19 +263,19 @@ def process_single_image( def process_markdown_images_with_llm(content: str, bucket_name: str, file_name: str) -> str: """Process all images in markdown content and upload them to S3. - + This function: 1. Finds all markdown image references in the content 2. Downloads images if they are URLs 3. Processes each image with LLM 4. Uploads images to S3 5. Replaces image references with processed understanding - + Args: content (str): The markdown content containing images bucket_name (str): S3 bucket name for uploading file_name (str): Base file name for S3 path - + Returns: str: Updated content with processed image understandings """ @@ -299,7 +299,7 @@ def process_markdown_images_with_llm(content: str, bucket_name: str, file_name: img_path = download_image_from_url(img_path) except Exception as e: logger.error(f"Error downloading image from URL {img_path}: {e}") - result += match.group(0) + result += match.group(1) last_end = end continue @@ -314,11 +314,11 @@ def process_markdown_images_with_llm(content: str, bucket_name: str, file_name: if understanding: result += f"\n\n{understanding}\n\n" else: - result += match.group(0) + result += match.group(1) except Exception as e: logger.error(f"Error processing image {idx}: {e}") - result += match.group(0) + result += match.group(1) last_end = end