egroupware/phpgwapi/doc/ldap/fix-ldap-charset-for-egw1.1.pl

#!/usr/bin/perl -w

use strict;
use MIME::Base64;
use Text::Iconv;

 #**************************************************************************
 #               fix-ldap-charset-for-egw1.1.pl  -  description            *
 #                             -------------------                         *
 #    begin                : Mon 2005/08/08                                *
 #    copyright            : (C) 2005 by Carsten Wolff                     *
 #    email                : wolffc@egroupware.org                         *
 #                                                                         *
 #   This program is free software; you can redistribute it and/or modify  *
 #   it under the terms of the GNU General Public License as published by  *
 #   the Free Software Foundation; either version 2 of the License, or     *
 #   (at your option) any later version.                                   *
 #                                                                         *
 #                                                                         *
 #   This script is used to adapt the charset in an egw ldap addressbook   *
 #   to the egw code in Release 1.1 and newer.                             *
 #                                                                         *
 #                                                                         *
 #   The old egw code just called utf8_encode on every attribute before    *
 #   writing and utf8_decode after reading an ldap attribute. This was     *
 #   fine as long as egw was run in iso8859-1, because then, calling       *
 #   utf8_encode was a proper conversion.                                  *
 #   But since egw supported systemcharsets, this call led to strings      *
 #   being encoded _twice_ before they were sent to ldap and thus being    *
 #   encoded in some weired mix of 2 charsets.                             *
 #   This of course confuses other LDAP-clients, because they don't        *
 #   know about the actual charset of the data anymore.                    *
 #   The new egw code now correctly _converts_ from every charset to utf8  *
 #   before sending data to ldap and converts from utf8 to systemcharset   *
 #   on reading. This of course makes it necessary, to correct the charset *
 #   of existing entries in the ldap-branch used by egw-addressbook        *
 #   (i.e. really make them utf-8), before the new code is being used.     *
 #                                                                         *
 #   How to use this script:                                               *
 #   1. make a datadump of your ldap database (f.e. slapcat>data.ldif)     *
 #   2. configure this script below                                        *
 #   3. convert the dump (./fix-ldap-charset-for-egw1.1.pl data.ldif)      *
 #   4. reimport the dump (f.e. slapadd -l data.ldif.conv)                 *
 #                                                                         *
 #**************************************************************************

##############################################################################
# CONFIGURATION - BEGIN
#
#
# only entries below this DN will be converted
my $basedn = "ou=addressbook,dc=domain,dc=xyz";
# this is the systemcharset of eGW, that was used at the time
# when the eGW-Code of your installation still was version 1.0.x or earlier
my $egw_systemcharset = "utf-8";
#
#
# CONFIGURATION - END
##############################################################################


# parameters
my $filename = $ARGV[0];
unless (-f $filename) {
	print "usage: " . $0 . " {ldif-filename}\n";
	exit 0;
}

# global objects
my $iconv_outer = Text::Iconv->new("utf-8", "iso-8859-1");
my $iconv_inner = Text::Iconv->new($egw_systemcharset, "utf-8");

# get an array of all entries
local $/;  # slurp mode
open(FOLD, "< $filename\0") || die "error opening source-file: $filename: $!";
flock(FOLD, 2);
my $file = <FOLD>;
my @old = split("\n\n",$file);
flock(FOLD, 8);
close(FOLD);

print "\nRead " . $#old . " entries from " . $filename . "\n";

# begin with conversion
my @new = ();
my $i = 0;
foreach my $oldentry (@old) {
	my $workentry = $oldentry;
	# concatenate base64 multline data
	$workentry =~ s/\n //g;
	# extract the raw DN and get it's readable form
	$workentry =~ /^(dn:[^\n]*)\n/;
	my %dn = getAttributeValue($1);
	# check, if this entry is to be converted
	my $basednregexp = regexpEscape($basedn);
	unless ($dn{'value'} =~ /^.+$basednregexp$/) {
		push(@new, $oldentry . "\n");
		next;
	}
	#
	# This entry is to be converted
	#
	my $newentry = "";
	my @attributes = split("\n", $workentry);
	foreach my $attr (@attributes) {
		my %attrib = getAttributeValue($attr);
		$attrib{'value'} = $iconv_inner->convert($iconv_outer->convert($attrib{'value'}));
		$newentry .= attrib2ldif(\%attrib);
	}
	push(@new,$newentry);
	$i++;
}
print "Converted $i entries in $basedn\n";

# write the result
open(FNEW, "> $filename" . ".conv\0") || die "error opening destination-file: $filename" . ".conv: $!";
flock(FNEW, 2);
foreach(@new) {
	print FNEW $_ . "\n";
}
flock(FNEW, 8);
close(FNEW);

print "Wrote $#new entries to $filename.conv\n\nPlease check the number of entries and have a look at\n$filename.conv, before reimporting it.\n\n";

#####################
# Subroutines
#####################

# break down an attribute in attribute-name and value
# if the value is base64, decode it.
sub getAttributeValue {
	my ($rawattr) = @_;
	my %attr = ();
	if ($rawattr =~ /^([^:]*):: (.*)/) {
		$attr{'name'} = $1;
		$attr{'value'} = decode_base64($2);
	} elsif ($rawattr =~ /^([^:]*): (.*)/) {
		$attr{'name'} = $1;
		$attr{'value'} = $2;
	} else {
		print "Error extracting data from attribute: " . $rawattr . "\n";
	}
	return %attr;
}

# escape a string for use within a regexp
sub regexpEscape {
	my ($string) = @_;
	$string =~ s/([\^\.\$\|\(\)\[\]\*\+\?\{\}])/\\$1/g;
	return $string;
}

# cahnge an attribute in suitable form for an ldif
sub attrib2ldif {
	my ($attrib) = @_;
	my ($key, $value) = ($attrib->{'name'}, $attrib->{'value'});
	# RFC2894 requires a string to be BASE64 encoded, if
	# - it begins with a char that's not a SAFE-INIT-CHAR
	# - or it contains a char that's not a SAFE-CHAR
	if ($value =~ /^[: <]/ or $value =~ /[^\x01-\x09\x0b-\x0c\x0e-\x7f]/) {
		# email-addresses can not contain unicode-characters
		if ($key eq "mail" or $key eq "phpgwMailHome") {
			print "Warning: forbidden characters in eMail-address detected: " . $value . "\n";
		}
		$value = encode_base64($value);
		$value =~ s/\n//g;
		# each line has to be no more than 77 characters long
		# including a leading space and, on the first line, the key.
		# Exceptions: dn and rdn
		unless ($key eq "dn" or $key eq "rdn") {
			my $keylen = length($key) + 3;
			my $form = substr($value, 0, 77 - $keylen);
			unless ($form eq $value) {
				my $j = 0;
				my $next = "";
				do  {
					$next = substr($value, 77 - $keylen + $j * 76, 76);
					$form .= "\n " . $next;
					$j++;
				} until (length($next) < 76);
			}
			$value = $form;
		}
		$key = $key . ":: ";
	} else {
		$key = $key . ": ";
	}
	return $key . $value . "\n";
}
fix the handling of charsets in the contacts_ldap class and provide admins with a script, that helps to fix old, corrupted eGW-LDAP-addressbooks. For a description of why and when this is necessary, please read the header of the fix-ldap-charset-for-egw1.1.pl script. This should appear in the release-notes for eGW 1.1. 2005-08-13 23:01:14 +02:00			`#!/usr/bin/perl -w`

			`use strict;`
			`use MIME::Base64;`
			`use Text::Iconv;`

			`#**************************************************************************`
			`# fix-ldap-charset-for-egw1.1.pl - description *`
			`# ------------------- *`
			`# begin : Mon 2005/08/08 *`
			`# copyright : (C) 2005 by Carsten Wolff *`
			`# email : wolffc@egroupware.org *`
			`# *`
			`# This program is free software; you can redistribute it and/or modify *`
			`# it under the terms of the GNU General Public License as published by *`
			`# the Free Software Foundation; either version 2 of the License, or *`
			`# (at your option) any later version. *`
			`# *`
			`# *`
			`# This script is used to adapt the charset in an egw ldap addressbook *`
			`# to the egw code in Release 1.1 and newer. *`
			`# *`
			`# *`
			`# The old egw code just called utf8_encode on every attribute before *`
			`# writing and utf8_decode after reading an ldap attribute. This was *`
			`# fine as long as egw was run in iso8859-1, because then, calling *`
			`# utf8_encode was a proper conversion. *`
			`# But since egw supported systemcharsets, this call led to strings *`
			`# being encoded _twice_ before they were sent to ldap and thus being *`
			`# encoded in some weired mix of 2 charsets. *`
			`# This of course confuses other LDAP-clients, because they don't *`
			`# know about the actual charset of the data anymore. *`
			`# The new egw code now correctly _converts_ from every charset to utf8 *`
			`# before sending data to ldap and converts from utf8 to systemcharset *`
			`# on reading. This of course makes it necessary, to correct the charset *`
			`# of existing entries in the ldap-branch used by egw-addressbook *`
			`# (i.e. really make them utf-8), before the new code is being used. *`
			`# *`
			`# How to use this script: *`
			`# 1. make a datadump of your ldap database (f.e. slapcat>data.ldif) *`
			`# 2. configure this script below *`
			`# 3. convert the dump (./fix-ldap-charset-for-egw1.1.pl data.ldif) *`
			`# 4. reimport the dump (f.e. slapadd -l data.ldif.conv) *`
			`# *`
			`#**************************************************************************`

			`##############################################################################`
			`# CONFIGURATION - BEGIN`
			`#`
			`#`
			`# only entries below this DN will be converted`
			`my $basedn = "ou=addressbook,dc=domain,dc=xyz";`
			`# this is the systemcharset of eGW, that was used at the time`
			`# when the eGW-Code of your installation still was version 1.0.x or earlier`
			`my $egw_systemcharset = "utf-8";`
			`#`
			`#`
			`# CONFIGURATION - END`
			`##############################################################################`


			`# parameters`
			`my $filename = $ARGV[0];`
			`unless (-f $filename) {`
			`print "usage: " . $0 . " {ldif-filename}\n";`
			`exit 0;`
			`}`

			`# global objects`
			`my $iconv_outer = Text::Iconv->new("utf-8", "iso-8859-1");`
			`my $iconv_inner = Text::Iconv->new($egw_systemcharset, "utf-8");`

			`# get an array of all entries`
			`local $/; # slurp mode`
			`open(FOLD, "< $filename\0") \|\| die "error opening source-file: $filename: $!";`
			`flock(FOLD, 2);`
			`my $file = <FOLD>;`
			`my @old = split("\n\n",$file);`
			`flock(FOLD, 8);`
			`close(FOLD);`

			`print "\nRead " . $#old . " entries from " . $filename . "\n";`

			`# begin with conversion`
			`my @new = ();`
			`my $i = 0;`
			`foreach my $oldentry (@old) {`
			`my $workentry = $oldentry;`
			`# concatenate base64 multline data`
			`$workentry =~ s/\n //g;`
			`# extract the raw DN and get it's readable form`
			`$workentry =~ /^(dn:[^\n]*)\n/;`
			`my %dn = getAttributeValue($1);`
			`# check, if this entry is to be converted`
			`my $basednregexp = regexpEscape($basedn);`
			`unless ($dn{'value'} =~ /^.+$basednregexp$/) {`
			`push(@new, $oldentry . "\n");`
			`next;`
			`}`
			`#`
			`# This entry is to be converted`
			`#`
			`my $newentry = "";`
			`my @attributes = split("\n", $workentry);`
			`foreach my $attr (@attributes) {`
			`my %attrib = getAttributeValue($attr);`
			`$attrib{'value'} = $iconv_inner->convert($iconv_outer->convert($attrib{'value'}));`
			`$newentry .= attrib2ldif(\%attrib);`
			`}`
			`push(@new,$newentry);`
			`$i++;`
			`}`
			`print "Converted $i entries in $basedn\n";`

			`# write the result`
			`open(FNEW, "> $filename" . ".conv\0") \|\| die "error opening destination-file: $filename" . ".conv: $!";`
			`flock(FNEW, 2);`
			`foreach(@new) {`
			`print FNEW $_ . "\n";`
			`}`
			`flock(FNEW, 8);`
			`close(FNEW);`

			`print "Wrote $#new entries to $filename.conv\n\nPlease check the number of entries and have a look at\n$filename.conv, before reimporting it.\n\n";`

			`#####################`
			`# Subroutines`
			`#####################`

			`# break down an attribute in attribute-name and value`
			`# if the value is base64, decode it.`
			`sub getAttributeValue {`
			`my ($rawattr) = @_;`
			`my %attr = ();`
			`if ($rawattr =~ /^([^:]):: (.)/) {`
			`$attr{'name'} = $1;`
			`$attr{'value'} = decode_base64($2);`
			`} elsif ($rawattr =~ /^([^:]): (.)/) {`
			`$attr{'name'} = $1;`
			`$attr{'value'} = $2;`
			`} else {`
			`print "Error extracting data from attribute: " . $rawattr . "\n";`
			`}`
			`return %attr;`
			`}`

			`# escape a string for use within a regexp`
			`sub regexpEscape {`
			`my ($string) = @_;`
			`$string =~ s/([\^\.\$\\|\(\)\[\]\*\+\?\{\}])/\\$1/g;`
			`return $string;`
			`}`

			`# cahnge an attribute in suitable form for an ldif`
			`sub attrib2ldif {`
			`my ($attrib) = @_;`
			`my ($key, $value) = ($attrib->{'name'}, $attrib->{'value'});`
			`# RFC2894 requires a string to be BASE64 encoded, if`
			`# - it begins with a char that's not a SAFE-INIT-CHAR`
			`# - or it contains a char that's not a SAFE-CHAR`
			`if ($value =~ /^[: <]/ or $value =~ /[^\x01-\x09\x0b-\x0c\x0e-\x7f]/) {`
			`# email-addresses can not contain unicode-characters`
			`if ($key eq "mail" or $key eq "phpgwMailHome") {`
			`print "Warning: forbidden characters in eMail-address detected: " . $value . "\n";`
			`}`
			`$value = encode_base64($value);`
			`$value =~ s/\n//g;`
			`# each line has to be no more than 77 characters long`
			`# including a leading space and, on the first line, the key.`
			`# Exceptions: dn and rdn`
			`unless ($key eq "dn" or $key eq "rdn") {`
			`my $keylen = length($key) + 3;`
			`my $form = substr($value, 0, 77 - $keylen);`
			`unless ($form eq $value) {`
			`my $j = 0;`
			`my $next = "";`
			`do {`
			`$next = substr($value, 77 - $keylen + $j * 76, 76);`
			`$form .= "\n " . $next;`
			`$j++;`
			`} until (length($next) < 76);`
			`}`
			`$value = $form;`
			`}`
			`$key = $key . ":: ";`
			`} else {`
			`$key = $key . ": ";`
			`}`
			`return $key . $value . "\n";`
			`}`