egroupware_official/doc/ldap/fix-ldap-charset-for-egw1.1.pl

191 lines
7.0 KiB
Perl
Executable File

#!/usr/bin/perl -w
use strict;
use MIME::Base64;
use Text::Iconv;
#**************************************************************************
# fix-ldap-charset-for-egw1.1.pl - description *
# ------------------- *
# begin : Mon 2005/08/08 *
# copyright : (C) 2005 by Carsten Wolff *
# email : wolffc@egroupware.org *
# *
# This program is free software; you can redistribute it and/or modify *
# it under the terms of the GNU General Public License as published by *
# the Free Software Foundation; either version 2 of the License, or *
# (at your option) any later version. *
# *
# *
# This script is used to adapt the charset in an egw ldap addressbook *
# to the egw code in Release 1.1 and newer. *
# *
# *
# The old egw code just called utf8_encode on every attribute before *
# writing and utf8_decode after reading an ldap attribute. This was *
# fine as long as egw was run in iso8859-1, because then, calling *
# utf8_encode was a proper conversion. *
# But since egw supported systemcharsets, this call led to strings *
# being encoded _twice_ before they were sent to ldap and thus being *
# encoded in some weired mix of 2 charsets. *
# This of course confuses other LDAP-clients, because they don't *
# know about the actual charset of the data anymore. *
# The new egw code now correctly _converts_ from every charset to utf8 *
# before sending data to ldap and converts from utf8 to systemcharset *
# on reading. This of course makes it necessary, to correct the charset *
# of existing entries in the ldap-branch used by egw-addressbook *
# (i.e. really make them utf-8), before the new code is being used. *
# *
# How to use this script: *
# 1. make a datadump of your ldap database (f.e. slapcat>data.ldif) *
# 2. configure this script below *
# 3. convert the dump (./fix-ldap-charset-for-egw1.1.pl data.ldif) *
# 4. reimport the dump (f.e. slapadd -l data.ldif.conv) *
# *
#**************************************************************************
##############################################################################
# CONFIGURATION - BEGIN
#
#
# only entries below this DN will be converted
my $basedn = "ou=addressbook,dc=domain,dc=xyz";
# this is the systemcharset of eGW, that was used at the time
# when the eGW-Code of your installation still was version 1.0.x or earlier
my $egw_systemcharset = "utf-8";
#
#
# CONFIGURATION - END
##############################################################################
# parameters
my $filename = $ARGV[0];
unless (-f $filename) {
print "usage: " . $0 . " {ldif-filename}\n";
exit 0;
}
# global objects
my $iconv_outer = Text::Iconv->new("utf-8", "iso-8859-1");
my $iconv_inner = Text::Iconv->new($egw_systemcharset, "utf-8");
# get an array of all entries
local $/; # slurp mode
open(FOLD, "< $filename\0") || die "error opening source-file: $filename: $!";
flock(FOLD, 2);
my $file = <FOLD>;
my @old = split("\n\n",$file);
flock(FOLD, 8);
close(FOLD);
print "\nRead " . $#old . " entries from " . $filename . "\n";
# begin with conversion
my @new = ();
my $i = 0;
foreach my $oldentry (@old) {
my $workentry = $oldentry;
# concatenate base64 multline data
$workentry =~ s/\n //g;
# extract the raw DN and get it's readable form
$workentry =~ /^(dn:[^\n]*)\n/;
my %dn = getAttributeValue($1);
# check, if this entry is to be converted
my $basednregexp = regexpEscape($basedn);
unless ($dn{'value'} =~ /^.+$basednregexp$/) {
push(@new, $oldentry . "\n");
next;
}
#
# This entry is to be converted
#
my $newentry = "";
my @attributes = split("\n", $workentry);
foreach my $attr (@attributes) {
my %attrib = getAttributeValue($attr);
$attrib{'value'} = $iconv_inner->convert($iconv_outer->convert($attrib{'value'}));
$newentry .= attrib2ldif(\%attrib);
}
push(@new,$newentry);
$i++;
}
print "Converted $i entries in $basedn\n";
# write the result
open(FNEW, "> $filename" . ".conv\0") || die "error opening destination-file: $filename" . ".conv: $!";
flock(FNEW, 2);
foreach(@new) {
print FNEW $_ . "\n";
}
flock(FNEW, 8);
close(FNEW);
print "Wrote $#new entries to $filename.conv\n\nPlease check the number of entries and have a look at\n$filename.conv, before reimporting it.\n\n";
#####################
# Subroutines
#####################
# break down an attribute in attribute-name and value
# if the value is base64, decode it.
sub getAttributeValue {
my ($rawattr) = @_;
my %attr = ();
if ($rawattr =~ /^([^:]*):: (.*)/) {
$attr{'name'} = $1;
$attr{'value'} = decode_base64($2);
} elsif ($rawattr =~ /^([^:]*): (.*)/) {
$attr{'name'} = $1;
$attr{'value'} = $2;
} else {
print "Error extracting data from attribute: " . $rawattr . "\n";
}
return %attr;
}
# escape a string for use within a regexp
sub regexpEscape {
my ($string) = @_;
$string =~ s/([\^\.\$\|\(\)\[\]\*\+\?\{\}])/\\$1/g;
return $string;
}
# cahnge an attribute in suitable form for an ldif
sub attrib2ldif {
my ($attrib) = @_;
my ($key, $value) = ($attrib->{'name'}, $attrib->{'value'});
# RFC2894 requires a string to be BASE64 encoded, if
# - it begins with a char that's not a SAFE-INIT-CHAR
# - or it contains a char that's not a SAFE-CHAR
if ($value =~ /^[: <]/ or $value =~ /[^\x01-\x09\x0b-\x0c\x0e-\x7f]/) {
# email-addresses can not contain unicode-characters
if ($key eq "mail" or $key eq "phpgwMailHome") {
print "Warning: forbidden characters in eMail-address detected: " . $value . "\n";
}
$value = encode_base64($value);
$value =~ s/\n//g;
# each line has to be no more than 77 characters long
# including a leading space and, on the first line, the key.
# Exceptions: dn and rdn
unless ($key eq "dn" or $key eq "rdn") {
my $keylen = length($key) + 3;
my $form = substr($value, 0, 77 - $keylen);
unless ($form eq $value) {
my $j = 0;
my $next = "";
do {
$next = substr($value, 77 - $keylen + $j * 76, 76);
$form .= "\n " . $next;
$j++;
} until (length($next) < 76);
}
$value = $form;
}
$key = $key . ":: ";
} else {
$key = $key . ": ";
}
return $key . $value . "\n";
}