
    B                        S SK JrJr  S SKJrJrJrJr   S SKJ	r	  S SKrSSKJrJrJrJr  SSKJrJrJr  SSKJr  SS	KJrJr  SS
KJrJrJrJrJ r J!r!  \RD                  " S5      r#\#RI                  \RJ                  5        \RL                  " 5       r'\'RQ                  \RR                  " S5      5        \#RU                  \'5               SS\+S\,S\,S\-S\\   S\\   S\.S\.S\4S jjr/       SS\S\,S\,S\-S\\   S\\   S\.S\.S\4S jjr0       SS\	S\,S\,S\-S\\   S\\   S\.S\.S\4S jjr1      SS\	S\,S\,S\-S\\   S\\   S\.S\4S jjr2g! \
 a    \r	 GNIf = f)    )basenamesplitext)BinaryIOListOptionalSet)PathLikeN   )coherence_ratioencoding_languagesmb_encoding_languagesmerge_coherence_ratios)IANA_SUPPORTEDTOO_BIG_SEQUENCETOO_SMALL_SEQUENCE)
mess_ratio)CharsetMatchCharsetMatches)any_specified_encoding	iana_nameidentify_sig_or_bomis_cp_similaris_multi_byte_encodingshould_strip_sig_or_bomcharset_normalizerz)%(asctime)s | %(levelname)s | %(message)s	sequencessteps
chunk_size	thresholdcp_isolationcp_exclusionpreemptive_behaviourexplainreturnc                 d   [        U [        [        45      (       d#  [        SR	                  [        U 5      5      5      eU(       d$  [        R                  [        R                  5        O#[        R                  [        R                  5        [        U 5      nUS:X  a/  [        R                  S5        [        [        U SSS/ S5      /5      $ UbB  [        R                  S	S
R                  U5      5        U V	s/ s H  n	[!        U	S5      PM     nn	O/ nUbB  [        R                  SS
R                  U5      5        U V	s/ s H  n	[!        U	S5      PM     nn	O/ nXU-  ::  a  [        R                  SUUU5        SnUnUS:  a  X-  U:  a  [#        X-  5      n[        U 5      [$        :  n
[        U 5      [&        :  nU
(       a%  [        R                  SR	                  U5      5        O+U(       a$  [        R)                  SR	                  U5      5        / nUSL a  [+        U 5      OSnUb'  UR-                  U5        [        R)                  SU5        [/        5       n/ n/ nSnSnSn[        5       n[1        U 5      u  nnUb1  UR-                  U5        [        R)                  S[        U5      U5        UR-                  S5        SU;  a  UR-                  S5        U[2        -    GH  nU(       a  UU;  a  M  U(       a  UU;   a  M"  UU;   a  M*  UR5                  U5        SnUU:H  nU=(       a    [7        U5      nUS;   a  USL a  [        R)                  SU5        My   [9        U5      n U(       a8  USL a3  [A        USL a  U S[#        S5       OU [        U5      [#        S5       US9  O[A        USL a  U OU [        U5      S US9n SnU H  n[G        UU5      (       d  M  Sn  O   U(       a  [        R                  SUW5        GM$  [I        USL a  SO
[        U5      U[#        X-  5      5      nU=(       a    USL=(       a    [        U5      U:  n U (       a  [        R)                  SU5        [#        [        U5      S-  5      n!U!S:  a  Sn!Sn"/ n#/ n$U H  n%U U%U%U-    n&U(       a
  USL a  UU&-   n&U&RK                  USS9n'U(       at  U%S:  an  U U%   S :  ae  US!:  a  S!OUn(U(       aT  U'SU( U;  aK  [I        U%U%S-
  S"5       H7  n)U U)U%U-    n&U(       a
  USL a  UU&-   n&U&RK                  USS9n'U'SU( U;   d  M7    O   U#R-                  U'5        U$R-                  [M        U'U5      5        U$S"   U:  a  U"S-  n"U"U!:  d  U(       d  M  USL d  M    O   U$(       a  [O        U$5      [        U$5      -  n*OSn*U*U:  d  U"U!:  ad  UR-                  U5        [        R                  S#UU"[Q        U*S$-  S%S&95        USSU4;   a$  [        U UUS/ U5      n+UU:X  a  U+nOUS:X  a  U+nOU+nGM>  [        R)                  S'U[Q        U*S$-  S%S&95        U(       d  [S        U5      n,O[U        U5      n,U,(       a.  [        R)                  S(R	                  U[A        U,5      5      5        / n-U# H9  n'[W        U'S)U,(       a  S*R                  U,5      OS5      n.U-R-                  U.5        M;     [Y        U-5      n/U/(       a%  [        R)                  S+R	                  U/U5      5        UR-                  [        U UU*UU/U5      5        UUSS4;   a-  U*S):  a'  [        R)                  S,U5        [        UU   /5      s  $ UU:X  d  GM  [        R)                  S-U5        [        UU   /5      s  $    [        U5      S:X  a  U(       d  U(       d  U(       a  [        R                  S.5        U(       a3  [        R                  S/URZ                  5        UR-                  U5        U$ U(       a  Ub+  U(       a!  U(       a  UR\                  UR\                  :w  d  Ub(  [        R                  S05        UR-                  U5        U$ U(       a&  [        R                  S15        UR-                  U5        U$ s  sn	f s  sn	f ! [:        [<        4 a    [        R?                  SU5         GM  f = f! [B        [D        4 aR  n[        U[D        5      (       d   [        R                  SU[A        U5      5        UR-                  U5         SnAGMF  SnAff = f)2a  
Given a raw bytes sequence, return the best possibles charset usable to render str objects.
If there is no results, it is a strong indicator that the source is binary/not text.
By default, the process will extract 5 blocs of 512o each to assess the mess and coherence of a given sequence.
And will give up a particular code page after 20% of measured mess. Those criteria are customizable at will.

The preemptive behavior DOES NOT replace the traditional detection workflow, it prioritize a particular code page
but never take it for granted. Can improve the performance.

You may want to focus your attention to some code page or/and not others, use cp_isolation and cp_exclusion for that
purpose.

This function will strip the SIG in the payload/sequence every time except on UTF-16, UTF-32.
z4Expected object of type bytes or bytearray, got: {0}r   zXGiven content is empty, stopping the process very early, returning empty utf_8 str matchutf_8g        F Nz`cp_isolation is set. use this flag for debugging purpose. limited list of encoding allowed : %s.z, zacp_exclusion is set. use this flag for debugging purpose. limited list of encoding excluded : %s.z^override steps (%i) and chunk_size (%i) as content does not fit (%i byte(s) given) parameters.r
   z>Trying to detect encoding from a tiny portion of ({}) byte(s).zIUsing lazy str decoding because the payload is quite large, ({}) byte(s).Tz@Detected declarative mark in sequence. Priority +1 given for %s.zIDetected a SIG or BOM mark on first %i byte(s). Priority +1 given for %s.ascii>   utf_16utf_32z[Encoding %s wont be tested as-is because it require a BOM. Will try some sub-encoder LE/BE.z2Encoding %s does not provide an IncrementalDecoderg    A)encodingz9Code page %s does not fit given bytes sequence at ALL. %szW%s is deemed too similar to code page %s and was consider unsuited already. Continuing!zpCode page %s is a multi byte encoding table and it appear that at least one character was encoded using n-bytes.      ignore)errors      zc%s was excluded because of initial chaos probing. Gave up %i time(s). Computed mean chaos is %f %%.d      )ndigitsz=%s passed initial chaos probing. Mean measured chaos is %f %%z&{} should target any language(s) of {}g?,z We detected language {} using {}z0%s is most likely the one. Stopping the process.z[%s is most likely the one as we detected a BOM or SIG within the beginning of the sequence.zONothing got out of the detection process. Using ASCII/UTF-8/Specified fallback.z#%s will be used as a fallback matchz&utf_8 will be used as a fallback matchz&ascii will be used as a fallback match)/
isinstance	bytearraybytes	TypeErrorformattypeloggersetLevelloggingCRITICALINFOlenwarningr   r   joinr   intr   r   infor   appendsetr   r   addr   r   ModuleNotFoundErrorImportErrordebugstrUnicodeDecodeErrorLookupErrorr   rangedecoder   sumroundr   r   r   r   r+   fingerprint)0r   r   r   r   r    r!   r"   r#   lengthcpis_too_small_sequenceis_too_large_sequenceprioritized_encodingsspecified_encodingtestedtested_but_hard_failuretested_but_soft_failurefallback_asciifallback_u8fallback_specifiedresultssig_encodingsig_payloadencoding_ianadecoded_payloadbom_or_sig_availablestrip_sig_or_bomis_multi_byte_decoderesimilar_soft_failure_testencoding_soft_failedr_multi_byte_bonusmax_chunk_gave_upearly_stop_count	md_chunks	md_ratiosicut_sequencechunkchunk_partial_size_chkjmean_mess_ratiofallback_entrytarget_languages	cd_ratioschunk_languagescd_ratios_mergeds0                                                   1platform/bq/third_party/charset_normalizer/api.py
from_bytesr~   &   s[	   2 i)U!344BIIY
 	
 (()%^F{f	
 |IwUBPRSTUU5IIl#	

 8DD|	"e,|D6IIl#	

 8DD|	"e,|Du$%l		
 
qyV^j0(
	N-??	N.>>LSS	

 
W^^	
  .BT-Iy)t  %$$%78N	

 UF  NKG 3I >L+$$\2W	
   )++$$W-.?M=M\9F"

=!+}</ 
4K5
 005IU5RKKm 	$:=$I!	$)>%)G'50 kD	*"3{#3c$i@*	 #&'50 "3{#3#56*	#  %*!$; ],@AA,0) %<
 %NNi$
 %.AC4D
 " .t+.O$v- 	 KK-  B!,q  !		A$QZ8L#(8E(A*\9 ''h'GE %Q9Q<43G %r/Bz '
 $556oM"1a!eR0'0Q^'D/4D4M+6+EL , 3 3M( 3 S !8"89_L! 1 U#Zy9:}	) A%  $55$$)9U)BS V !)ns9~=O!Oi'+;?P+P#**=9NN0 o+Q7 '3E FF!-}iO" !$66)7&"g-%3N"0KK/C'3	
 %1-@4]CKK8??!3'7#8 	E-s:JCHH%56PTO _-  2)<KK299$m 	$ 		
 0'7CC#%KKBM "7=#9":;;L(KKm "7=#9":;;U @X 7|q.,>NNa NN57I7R7R NN-.  N ^3"++~/I/II'NNCDNN;'
 N	 NNCDNN>*No
 E EF $[1 	LLDm 		* #K0 	a--O!F
 $**=9	s=   9b bb*>c)c&c
	c
d/Ad**d/fpc           
      B    [        U R                  5       UUUUUUU5      $ )zz
Same thing than the function from_bytes but using a file pointer that is already ready.
Will not close the file pointer.
)r~   read)r   r   r   r   r    r!   r"   r#   s           r}   from_fpr     s/     
		 	    pathc                 t    [        U S5       n[        UUUUUUUU5      sSSS5        $ ! , (       d  f       g= f)z
Same thing than the function from_bytes but with one extra step. Opening and reading given file path in binary mode.
Can raise IOError.
rbN)openr   )	r   r   r   r   r    r!   r"   r#   r   s	            r}   	from_pathr     s<     
dD	R 	
 
		s   )
7c           
         [        U UUUUUU5      n[        U 5      n[        [        U5      5      n	[	        U5      S:X  a  [        SR                  U5      5      eUR                  5       n
U	S==   SU
R                  -   -  ss'   [        SR                  [        U 5      R                  USR                  U	5      5      5      S5       nUR                  U
R                  5       5        SSS5        U
$ ! , (       d  f       U
$ = f)za
Take a (text-based) file path and try to create another file next to it, this time using UTF-8.
r   z;Unable to normalize "{}", no encoding charset seems to fit.-z{}r'   wbN)r   r   listr   rB   IOErrorr;   bestr+   r   rM   replacerD   writeoutput)r   r   r   r   r    r!   r"   ra   filenametarget_extensionsresultr   s               r}   	normalizer     s     G ~HXh/0
7|qIPP
 	
 \\^FaC&//11	CI%%h8I0JKLd
	
!

 M
 

 Ms    C11
D )      皙?NNTF)r   r   r   NNT)3os.pathr   r   typingr   r   r   r   osr	   rK   rM   r?   cdr   r   r   r   constantr   r   r   mdr   modelsr   r   utilsr   r   r   r   r   r   	getLoggerr=   r>   DEBUGStreamHandlerhandlersetFormatter	Formatter
addHandlerr9   rE   floatboolr~   r   r   r    r   r}   <module>r      s   & 0 0   K J  0  
		/	0  



!   W&&'RS T   ' 
 ""!%JJJ J 	J
 s)J s)J J J J^ ""!%  	
 s) s)   8 ""!%



 
 	

 s)
 s)
 
 
 
: ""!%)
)) ) 	)
 s)) s)) ) )C  Hs   E E&%E&