# -*- coding: utf-8 -*-
## Copyright (C) 2009 CERN.
##
## This file is free software; you can redistribute it and/or
## modify it under the terms of the GNU General Public License as
## published by the Free Software Foundation; either version 2 of the
## License, or (at your option) any later version.
##
## CDS Invenio is distributed in the hope that it will be useful, but
## WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
## General Public License for more details.
##
## You should have received a copy of the GNU General Public License
## along with CDS Invenio; if not, write to the Free Software Foundation, Inc.,
## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
"""Creates the minimal set of Unicode character ranges for valid XML 1.0 and 1.1
characters minus the compatibility changes"""
INCLUDE_XML10 = "#x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] \
| [#x10000-#x10FFFF]"
EXCLUDE_XML10 = "[#x7F-#x84], [#x86-#x9F], [#xFDD0-#xFDEF], \
[#x1FFFE-#x1FFFF], [#x2FFFE-#x2FFFF], [#x3FFFE-#x3FFFF], \
[#x4FFFE-#x4FFFF], [#x5FFFE-#x5FFFF], [#x6FFFE-#x6FFFF], \
[#x7FFFE-#x7FFFF], [#x8FFFE-#x8FFFF], [#x9FFFE-#x9FFFF], \
[#xAFFFE-#xAFFFF], [#xBFFFE-#xBFFFF], [#xCFFFE-#xCFFFF], \
[#xDFFFE-#xDFFFF], [#xEFFFE-#xEFFFF], [#xFFFFE-#xFFFFF], \
[#x10FFFE-#x10FFFF]"
INCLUDE_XML11 = "[#x1-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]"
EXCLUDE_XML11 = "[#x1-#x8], [#xB-#xC], [#xE-#x1F], [#x7F-#x84], [#x86-#x9F], \
[#xFDD0-#xFDDF], \
[#x1FFFE-#x1FFFF], [#x2FFFE-#x2FFFF], [#x3FFFE-#x3FFFF], \
[#x4FFFE-#x4FFFF], [#x5FFFE-#x5FFFF], [#x6FFFE-#x6FFFF], \
[#x7FFFE-#x7FFFF], [#x8FFFE-#x8FFFF], [#x9FFFE-#x9FFFF], \
[#xAFFFE-#xAFFFF], [#xBFFFE-#xBFFFF], [#xCFFFE-#xCFFFF], \
[#xDFFFE-#xDFFFF], [#xEFFFE-#xEFFFF], [#xFFFFE-#xFFFFF], \
[#x10FFFE-#x10FFFF]"
def cleanup(value):
    """Prepare string for conversion to hex ranges

    Every '#' becomes '0', so that "#x20" reads as the hexadecimal literal
    "0x20", and the square brackets around ranges are dropped.

    @param value: String with ranges
    @return: String with ranges
    """
    # Chained replace() instead of translate(None, '[]'): the two-argument
    # translate() exists only on Python 2 byte strings and raises TypeError
    # for unicode strings and on Python 3.
    return value.replace('#', '0').replace('[', '').replace(']', '')
def list_to_range(value):
    """Expand a list of hexadecimal numbers and ranges into plain numbers

    @param value: List of strings; each one is either a single hexadecimal
                  number ("0x9") or two of them joined by '-' ("0x20-0x7E"),
                  which stands for the inclusive range between them
    @return: List of integers in input order, with every range expanded
    """
    numbers = []
    for entry in value:
        if '-' in entry:
            bounds = [int(part, 16) for part in entry.split('-')]
            # The upper bound belongs to the range, hence the + 1.
            numbers.extend(range(bounds[0], bounds[1] + 1))
        else:
            numbers.append(int(entry, 16))
    return numbers
def _format_code_point(number):
    """Write one code point as a backslash escape

    @param number: Code point as an integer
    @return: Backslash, 'u' and four hex digits for values up to 0xFFFF;
             backslash, 'U' and eight hex digits for larger values
    """
    if number > 0xFFFF:
        return '\\U%08X' % number
    return '\\u%04X' % number


def range_minus(include_range, exclude_range):
    """Subtract one set of character ranges from another

    @param include_range: String from http://www.w3.org/TR/xml/#charsets or
                          http://www.w3.org/TR/xml11/#charsets, with entries
                          separated by ' | '
    @param exclude_range: Ditto, with entries separated by ', '
    @return: String with hex numbers and ranges: one entry for every run of
             consecutive remaining code points, in ascending order. A run of
             one code point is written as a single escape, a longer run as
             the escapes of its first and last code point joined by '-'.
             Entries are joined with ' \\n '.
    """
    # Blank entries (e.g. from an empty string) are dropped so that
    # "nothing to include" or "nothing to exclude" does not reach int().
    includes = [entry for entry in cleanup(include_range).split(' | ')
                if entry.strip()]
    excludes = [entry for entry in cleanup(exclude_range).split(', ')
                if entry.strip()]

    # Set difference keeps the subtraction linear in the number of code
    # points instead of scanning the exclusion list once per code point.
    numbers = set(list_to_range(includes))
    numbers.difference_update(list_to_range(excludes))

    # A run starts where the predecessor is missing and ends where the
    # successor is missing. Sets are unordered, so both lists are sorted
    # explicitly; the i-th start then belongs with the i-th end.
    lows = sorted(number for number in numbers
                  if number - 1 not in numbers)
    highs = sorted(number for number in numbers
                   if number + 1 not in numbers)

    entries = []
    for low, high in zip(lows, highs):
        if low == high:
            entries.append(_format_code_point(low))
        else:
            entries.append(
                _format_code_point(low) + '-' + _format_code_point(high))
    return ' \n '.join(entries)
# Script output: print the reduced range list for each XML version.
# A single parenthesised argument prints identically under the Python 2
# print statement and the Python 3 print function.
print('XML 1.0: \n ' + range_minus(INCLUDE_XML10, EXCLUDE_XML10) + ' \n ')
print('XML 1.1: \n ' + range_minus(INCLUDE_XML11, EXCLUDE_XML11))