detect UTF encodings when loading json

This commit is contained in:
David Lord 2018-04-10 09:29:48 -07:00
parent 0cbe698958
commit ab4142215d
No known key found for this signature in database
GPG key ID: 7A1C87E3F5BC42A8
4 changed files with 82 additions and 26 deletions

View file

@ -6,7 +6,7 @@ flask.json
:copyright: © 2010 by the Pallets team.
:license: BSD, see LICENSE for more details.
"""
import codecs
import io
import uuid
from datetime import date, datetime
@ -121,6 +121,49 @@ def _load_arg_defaults(kwargs):
kwargs.setdefault('cls', JSONDecoder)
def detect_encoding(data):
"""Detect which UTF codec was used to encode the given bytes.
The latest JSON standard (:rfc:`8259`) suggests that only UTF-8 is
accepted. Older documents allowed 8, 16, or 32. 16 and 32 can be big
or little endian. Some editors or libraries may prepend a BOM.
:param data: Bytes in unknown UTF encoding.
:return: UTF encoding name
"""
head = data[:4]
if head[:3] == codecs.BOM_UTF8:
return 'utf-8-sig'
if b'\x00' not in head:
return 'utf-8'
if head in (codecs.BOM_UTF32_BE, codecs.BOM_UTF32_LE):
return 'utf-32'
if head[:2] in (codecs.BOM_UTF16_BE, codecs.BOM_UTF16_LE):
return 'utf-16'
if len(head) == 4:
if head[:3] == b'\x00\x00\x00':
return 'utf-32-be'
if head[::2] == b'\x00\x00':
return 'utf-16-be'
if head[1:] == b'\x00\x00\x00':
return 'utf-32-le'
if head[1::2] == b'\x00\x00':
return 'utf-16-le'
if len(head) == 2:
return 'utf-16-be' if head.startswith(b'\x00') else 'utf-16-le'
return 'utf-8'
def dumps(obj, **kwargs):
"""Serialize ``obj`` to a JSON formatted ``str`` by using the application's
configured encoder (:attr:`~flask.Flask.json_encoder`) if there is an
@ -155,7 +198,10 @@ def loads(s, **kwargs):
"""
_load_arg_defaults(kwargs)
if isinstance(s, bytes):
s = s.decode(kwargs.pop('encoding', None) or 'utf-8')
encoding = kwargs.pop('encoding', None)
if encoding is None:
encoding = detect_encoding(s)
s = s.decode(encoding)
return _json.loads(s, **kwargs)