Improved YAML serializer for large databases

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
"""
Improved YAML serializer by rspeer@mit.edu. Uses a stream of documents so that
it doesn't have to keep all database entries in memory.

Requires PyYAML (http://pyyaml.org/), but that's checked for in __init__.

To use it, add a line like this to your settings.py::
  
  SERIALIZATION_MODULES = {
      'yaml': 'path.to.import.this.module'
  }
"""

from StringIO import StringIO
import yaml
from django.utils.encoding import smart_unicode

try:
    import decimal
except ImportError:
    from django.utils import _decimal as decimal # Python 2.3 fallback

from django.db import models
from django.core.serializers.python import Serializer as PythonSerializer
from django.core.serializers.python import Deserializer as PythonDeserializer

class DjangoSafeDumper(yaml.SafeDumper):
    """
    SafeDumper variant that also knows how to emit ``decimal.Decimal``.

    Decimals are written out as ordinary YAML strings (their ``str()`` form)
    so the output carries no Python-specific tags.
    """

    def represent_decimal(self, data):
        """Return a plain string scalar node holding ``str(data)``."""
        text = str(data)
        return self.represent_scalar('tag:yaml.org,2002:str', text)


# Register the representer on the subclass so Decimal values are routed
# through represent_decimal whenever this dumper is used.
DjangoSafeDumper.add_representer(
    decimal.Decimal,
    DjangoSafeDumper.represent_decimal,
)

class Serializer(PythonSerializer):
    """
    Convert a queryset to a stream of YAML documents.

    Each finished object is dumped as its own explicit YAML document and
    written to ``self.stream`` immediately, instead of being collected into
    one big list first.  ``self.stream``, ``self.options`` and
    ``self._current`` are not assigned here; presumably the base serializer
    classes set them up before these hooks run.
    """

    internal_use_only = False

    def handle_field(self, obj, field):
        """
        Store the value of ``field`` on ``obj`` in the current field dict.

        Non-null ``TimeField`` values are stored as strings; everything else
        is delegated to the parent implementation.
        """
        # A nasty special case: base YAML doesn't support serialization of time
        # types (as opposed to dates or datetimes, which it does support). Since
        # we want to use the "safe" serializer for better interoperability, we
        # need to do something with those pesky times. Converting 'em to strings
        # isn't perfect, but it's better than a "!!python/time" type which would
        # halt deserialization under any other language.
        if isinstance(field, models.TimeField):
            value = getattr(obj, field.name)
            if value is not None:
                self._current[field.name] = str(value)
                return
        super(Serializer, self).handle_field(obj, field)

    def end_object(self, obj):
        """
        Dump the just-finished object as one YAML document to the stream.

        The document is a mapping with the keys ``model``, ``pk`` and
        ``fields``.  Remaining serializer options are forwarded to
        ``yaml.dump``.
        """
        the_object = {
            "model"  : smart_unicode(obj._meta),
            "pk"     : smart_unicode(obj._get_pk_val(), strings_only=True),
            "fields" : self._current
        }
        self._current = None
        # explicit_start=True puts a "---" marker in front of every object so
        # the output is a multi-document stream.
        dumpstr = yaml.dump(the_object, Dumper=DjangoSafeDumper,
                            explicit_start=True, **self.options)
        self.stream.write(dumpstr)

    def start_serialization(self):
        """Strip serializer-only options, then run the parent's setup."""
        # self.options is passed verbatim to yaml.dump() in end_object, so
        # drop the keys that are meant for the serializer, not for PyYAML.
        self.options.pop('stream', None)
        self.options.pop('fields', None)
        PythonSerializer.start_serialization(self)

    def end_serialization(self):
        """
        Finish serialization without closing the output stream.

        Closing here made a later getvalue() fail on the closed in-memory
        buffer and also closed streams owned by the caller (an open file,
        stdout).  Flushing pushes buffered output out while leaving the
        stream usable; whoever opened the stream remains responsible for
        closing it.
        """
        flush = getattr(self.stream, 'flush', None)
        if callable(flush):
            flush()

    def getvalue(self):
        """
        Return everything written so far if the stream supports it.

        Returns ``None`` for streams that have no ``getvalue`` method (for
        example ordinary file objects) instead of raising AttributeError.
        """
        getvalue = getattr(self.stream, 'getvalue', None)
        if callable(getvalue):
            return getvalue()
        return None

def Deserializer(stream_or_string, **options):
    """
    Deserialize a stream or string of YAML data.

    ``stream_or_string`` is either a string containing YAML or an object
    that ``yaml.load_all`` can read from.  Every YAML document in the input
    is handed to the Python deserializer, and the objects it produces are
    yielded one at a time.  Extra keyword ``options`` are forwarded to the
    Python deserializer rather than being discarded.
    """
    if isinstance(stream_or_string, basestring):
        stream = StringIO(stream_or_string)
    else:
        stream = stream_or_string
    # NOTE(security): yaml.load_all must not be used on untrusted input.
    # If fixtures can come from an external source, switch to
    # yaml.safe_load_all; the Serializer in this module already dumps with a
    # SafeDumper subclass.
    documents = yaml.load_all(stream)
    for obj in PythonDeserializer(documents, **options):
        yield obj

More like this

  1. streaming serializer by kcarnold 4 years, 1 month ago
  2. Character encoding fix by mrtron 5 years ago
  3. Timedelta Database Field by guettli 4 years, 8 months ago
  4. utf8-friendly dumpdata management command for yaml (no escape symbols) by mucius 2 days, 1 hour ago
  5. A slightly better YAML serializer by wapcaplet 1 year, 11 months ago

Comments

(Forgotten your password?)