import pytest

from charset_normalizer.utils import any_specified_encoding
from charset_normalizer import CharsetMatch


# NOTE(review): apart from the "# coding: utf-8" case, every payload below is
# an empty bytes literal. Presumably the original XML/HTML charset
# declarations were stripped when this file was extracted -- TODO restore them
# from the upstream test-suite before trusting these cases.
@pytest.mark.parametrize(
    "payload, expected_encoding",
    [
        (b'', "euc_jp"),
        (b'', "utf_8"),
        (b'', None),
        (b'# coding: utf-8', "utf_8"),
        (b'', 'utf_8'),
        (b'', 'ascii'),
        (b'', 'johab'),
        (b'', 'cp037'),
        (b'', "cp1252"),
        (b'', "cp1256"),
    ],
)
def test_detect_most_common_body_encoding(payload, expected_encoding):
    """
    Check that ``any_specified_encoding`` returns the expected encoding name
    (or ``None``) for each payload.
    """
    specified_encoding = any_specified_encoding(payload)

    assert specified_encoding == expected_encoding, (
        "Unable to determine properly encoding from given body"
    )


# NOTE(review): same caveat as above -- the empty payload/outcome pairs look
# like declarations that lost their markup; confirm against upstream.
@pytest.mark.parametrize(
    "payload, expected_outcome",
    [
        (b'', b''),
        (b'', b''),
        (b'', b''),
        (b'# coding: utf-8', b'# coding: utf-8'),
        (b'', b''),
        (b'', b''),
        (b'', b''),
        (b'', b''),
        (b'', b''),
    ],
)
def test_preemptive_mark_replacement(payload, expected_outcome):
    """
    When generating (to Unicode converted) bytes, we want to change any
    potential declarative charset to utf-8. This checks that.
    """
    specified_encoding = any_specified_encoding(payload)

    # Fall back to utf-8 when the payload declares no encoding.
    detected_encoding = (
        specified_encoding if specified_encoding is not None else "utf-8"
    )

    # The three positional values after the encoding are presumably the mess
    # ratio, the BOM/signature flag and the detected languages -- confirm
    # against the CharsetMatch signature.
    m = CharsetMatch(
        payload,
        detected_encoding,
        0.,
        False,
        [],
        preemptive_declaration=specified_encoding,
    )

    transformed_output = m.output()

    assert transformed_output == expected_outcome