from django import forms
from django.utils.translation import ugettext_lazy as _
class UploadedFileInMemoryError(Exception):
    """Raised when an uploaded file is held in memory and has no on-disk path."""
class DocField(forms.FileField):
    """
    Form field for PDF and Microsoft Word documents (both .doc and .docx).

    The uploaded file is validated by inspecting its content with the *nix
    ``file`` command instead of trusting its name.

    Usage::

        class ApplicationForm(forms.Form):
            doc = DocField()

    It extends ``forms.FileField``, so it accepts every argument relevant to
    FileField.

    IMPORTANT NOTE: validation runs the *nix ``file`` utility, therefore

    1. only *nix systems can use this class;
    2. the uploaded file must be saved on disk, meaning the upload handlers
       must be restricted to TemporaryFileUploadHandler, i.e. put this in
       your settings.py::

           FILE_UPLOAD_HANDLERS = (
               "django.core.files.uploadhandler.TemporaryFileUploadHandler",
           )
    """

    default_error_messages = {
        'invalid': _(u"No file was submitted. Check the encoding type on the form."),
        'missing': _(u"No file was submitted."),
        'empty': _(u"The submitted file is empty."),
        'not_doc': _(u"Upload a valid document. The file you uploaded was not a acceptable document or a corrupted document."),
    }

    # Regular expression searched for in the description printed by ``file``.
    # Subclasses may override it; other ``file`` releases may word their
    # descriptions differently.
    file_type_pattern = r'PDF document|Microsoft Office Document|Zip archive data'

    # Archive member that must exist for a zip container to count as .docx.
    docx_member = 'word/document.xml'

    def clean(self, data, initial=None):
        """
        Validate *data* as a PDF or MS Word upload.

        Returns whatever ``FileField.clean(data, initial)`` returns. When no
        new file was submitted, that value is returned without running the
        content check, because there is nothing on disk to inspect.

        Raises:
            forms.ValidationError: the parent validation failed, or the
                content is not recognised as an accepted document.
            UploadedFileInMemoryError: the upload has no
                ``temporary_file_path`` and so cannot be inspected.
        """
        import re

        cleaned = super(DocField, self).clean(data, initial)

        # No new upload (e.g. an edit form that keeps its initial file).
        if not data:
            return cleaned

        if not hasattr(data, 'temporary_file_path'):
            # throw an error because uploaded file in memory
            raise UploadedFileInMemoryError('The file uploaded is stored in memory instead of disk and the validation cannot be performed.')
        path = data.temporary_file_path()

        found = re.search(self.file_type_pattern, self._describe_file(path))
        if found is None:
            raise forms.ValidationError(self.error_messages['not_doc'])

        # A .docx is reported as a generic zip archive, so look inside it.
        if found.group(0).startswith('Zip') and not self._is_docx(path):
            raise forms.ValidationError(self.error_messages['not_doc'])

        return cleaned

    def _describe_file(self, path):
        """Return the description ``file`` prints for *path* ('' on failure)."""
        import subprocess

        # An argument list (no shell) keeps the path from being parsed as
        # shell syntax; ``-b`` stops ``file`` from echoing the file name, so
        # the name itself can never satisfy ``file_type_pattern``.
        # NOTE(review): the temporary name may embed part of the
        # client-supplied file name on some Django versions - confirm.
        try:
            proc = subprocess.Popen(['file', '-b', path], stdout=subprocess.PIPE)
        except OSError:
            # ``file`` could not be started; an empty description makes the
            # caller reject the upload, as the os.popen() version did.
            return ''
        output = proc.communicate()[0] or b''
        return output.decode('utf-8', 'replace')

    def _is_docx(self, path):
        """Return True if *path* is a zip archive containing ``docx_member``."""
        import zipfile

        if not zipfile.is_zipfile(path):
            return False
        try:
            archive = zipfile.ZipFile(path)
        except zipfile.BadZipfile:
            return False
        try:
            return self.docx_member in archive.namelist()
        finally:
            archive.close()
Comments
In an effort to make this snippet portable, wouldn't it make more sense to look for the magic pattern in the file to identify it as a PDF or doc file?
For example, with PDF, make sure the file begins with "%PDF-". For a Word document, check that it begins with "\x31\xbe\x00\x00" or "PO^Q`".
Those are merely taken from the /etc/gnome-vfs-mime-magic file on a Fedora 8 box, but given that the specifications for both file formats are now openly available, I'm sure you can verify the true file magic necessary to identify these files.
#
Older Office documents are identified by:
So that should be added to the list.
#