Ever had to sort a file alphabetically, only to realize that you'd have to do it manually because every item that needs to be sorted is spread over more than one line? This just happened when I exported my
Gmail contacts to vCard, which it turned out were sorted by formatted name (FN) instead of name (N). The result was the following script, which takes two patterns and some input, and returns the sorted output. The example returned by
./sort_blocks.py --help is exactly the code to re-sort Gmail contacts. I'd love to know if you find any bugs or possible improvements to this script. Enjoy:
#! /usr/bin/env python
# -*- coding: utf-8 -*-
## Copyright (C) 2009 CERN.
## Sort any multi-line block text
## This file is free software; you can redistribute it and/or
## modify it under the terms of the GNU General Public License as
## published by the Free Software Foundation; either version 2 of the
## License, or (at your option) any later version.
## CDS Invenio is distributed in the hope that it will be useful, but
## WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
## General Public License for more details.
## You should have received a copy of the GNU General Public License
## along with CDS Invenio; if not, write to the Free Software Foundation, Inc.,
## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
"""sort_blocks.py - Multiline sort of standard input
./sort_blocks.py -b 'pattern' -s 'pattern' < input_file > result_file
-v,--verbose Verbose mode
-h,--help Print this message
-b,--bp Block pattern (dotall multiline); used to extract blocks
-s,--sp Sort pattern (dotall multiline); extracted to sort blocks
./sort_blocks.py -b 'BEGIN:VCARD.*?END:VCARD\\r\\n' -s '^N:(.*)$' \
< contacts.vcf > contacts2.vcf
Orders vCards in contacts.vcf by name, and puts the results in contacts2.vcf."""

import getopt
import re
import sys


class Usage(Exception):
    """Raised when the command line arguments are invalid.

    The text to show the user is available both as the ``msg`` attribute
    and through ``str(exc)``.
    """

    def __init__(self, msg):
        # Hand the message to Exception as well, so that str(exc) and
        # exc.args carry it instead of being empty.
        Exception.__init__(self, msg)
        self.msg = msg
def _compare_pattern ( sort_pattern , text1 , text2 ):
"""Function to sort by regex"""
matches = [
re . search ( sort_pattern , text , re . DOTALL | re . MULTILINE )
for text in [ text1 , text2 ]]
text_matches = 
for match in matches :
if match is None :
text_matches . append ( '' )
text_matches . append ( match . group ( 1 ))
return cmp ( text_matches [ 0 ], text_matches [ 1 ])
def split_and_sort(text, block_pattern, sort_pattern):
    """Split text into blocks, sort the blocks, and join them up again.

    @param text: String of blocks to sort
    @param block_pattern: Regular expression matching one whole block
    @param sort_pattern: Regular expression whose first group captures the
        part of a block to sort by
    @return: The blocks found in text, concatenated in sorted order

    Both patterns are applied with re.DOTALL | re.MULTILINE. Any text that
    block_pattern does not match is absent from the result. A block in which
    sort_pattern does not match sorts as if its key were the empty string.

    NOTE: the blocks come from re.findall, which returns the groups rather
    than the whole match when block_pattern contains capturing groups; use
    non-capturing groups, (?:...), inside block_pattern.
    """
    flags = re.DOTALL | re.MULTILINE
    sort_regex = re.compile(sort_pattern, flags)

    def sort_key(block):
        # Extract the key once per block rather than once per comparison.
        match = sort_regex.search(block)
        return '' if match is None else match.group(1)

    text_blocks = re.findall(block_pattern, text, flags)
    # A key function works on Python 2 and 3 alike; list.sort() no longer
    # accepts a comparison function on Python 3.
    text_blocks.sort(key=sort_key)
    return ''.join(text_blocks)
def main(argv=None):
    """Command line entry point: sort the blocks read from standard input.

    @param argv: Argument list including the program name; defaults to
        sys.argv
    @return: 0 after sorting or after printing the help text, 2 when the
        arguments are invalid (the reason is written to standard error)
    """
    if argv is None:
        argv = sys.argv
    block_pattern = ''
    sort_pattern = ''
    try:
        try:
            # NOTE(review): the usage text also advertises -v/--verbose, but
            # no such option is implemented, so getopt rejects it.
            opts, args = getopt.getopt(
                argv[1:], 'hb:s:', ['help', 'bp=', 'sp='])
        except getopt.GetoptError as err:
            raise Usage(err.msg)
        for option, value in opts:
            if option in ('-h', '--help'):
                print(__doc__)
                # Asking for help is not an error; stop before the
                # missing-pattern check below reports one.
                return 0
            elif option in ('-b', '--bp'):
                block_pattern = value
            elif option in ('-s', '--sp'):
                sort_pattern = value
            else:
                raise Usage('Unhandled option %s' % option)
        # Both patterns are mandatory and no positional arguments are taken.
        if block_pattern == '' or sort_pattern == '' or args:
            raise Usage(__doc__)
        text = sys.stdin.read()
        # write() rather than print, so that no extra newline is appended to
        # the sorted data.
        sys.stdout.write(split_and_sort(text, block_pattern, sort_pattern))
    except Usage as err:
        sys.stderr.write(err.msg + '\n')
        return 2
    return 0
if __name__ == '__main__':
    # Use the value returned by main() as the exit status of the process.
    sys.exit(main())