Compression corruption

From Wikitech

The Latin-1 to UTF-8 dump converter that's been used on various wikis unrecoverably corrupts any compressed old revisions that happen to be in the dump. If any accidentally sneak in, they need to be recovered from a pre-conversion backup.

This script (run it from the maintenance subdirectory) will pull compressed items from a pre-conversion copy of the database, and output SQL statements to be run on the post-conversion database to fix the corrupted entries.

(Not very robust: it assumes that the old_flags field is still set on the post-conversion side. It also checks for entries that were already in UTF-8, because there were some; it's not clear why.)

--Brion 09:34, 28 Nov 2004 (UTC)

<?php

# Find compressed entries, convert them from latin-1 to UTF-8,
# recompress, and write out SQL to update corrupted entries
# in the database with the corrected data.
#
# brion vibber 2004-11-28
# for recovery of es.wiktionary.org data

require_once( "commandLine.inc" );
require_once( "compressOld.inc" );

# Recompressing the converted text needs gzdeflate(), so refuse to start
# without zlib. Exit with a non-zero status so that calling shells and
# scripts can tell the run failed.
if( !function_exists( "gzdeflate" ) ) {
	print "You must enable zlib support in PHP to compress old revisions!\n";
	print "Please see http://www.php.net/manual/en/ref.zlib.php\n\n";
	exit( 1 );
}

# Do the actual work: generate the SQL fix-up file.
fixThemUp();

print "Done.\n";
exit();

# Generate SQL that restores gzip-compressed revisions with UTF-8 text.
#
# Selects every row of `old` whose old_flags contains 'gzip', inflates
# old_text, and decides whether the text already looks like UTF-8.
# Text that does not is converted from CP1252 to UTF-8 and deflated again;
# text that does is passed through with its original compressed bytes.
# For each row an "UPDATE old SET old_text=..." statement is written to
# fix-eswiktionary.sql in the current directory, and a progress line is
# printed to standard output.
#
# Takes no parameters and returns nothing. Stops the script with a
# non-zero exit status if the output file cannot be opened, a row cannot
# be inflated, or the charset conversion fails.
function fixThemUp() {
	$sql = "SELECT * FROM old WHERE old_flags like '%gzip%'";
	$res = wfQuery( $sql, DB_READ );
	$n = 0;
	$already = 0;
	$out = fopen( "fix-eswiktionary.sql", "wt" );
	if( $out === false ) {
		print "Could not open fix-eswiktionary.sql for writing\n";
		exit( 1 );
	}
	fwrite( $out, "-- Fixups for eswiktionary compression corruption, 2004-11-28\n" );
	
	while( $row = wfFetchObject( $res ) ) {
		$decomp = gzinflate( $row->old_text );
		if( $decomp === false ) {
			print "urp: {$row->old_id}\n";
			exit( 1 );
		}
		
		# Pure 7-bit text is valid in both encodings, so it never counts as
		# "already UTF-8"; converting it is a no-op anyway.
		$ascii = !preg_match( '/[\x80-\xff]/', $decomp );
		if( $row->old_id == 1235 ) {
			$utf8 = false; # THIS segfaults i'm so sad
		} else {
			# Structural check: the whole text must consist of well-formed
			# 1- to 4-byte UTF-8 sequences.
			$utf8 = preg_match( '/^(?:[\x00-\x7f]|[\xc0-\xdf][\x80-\xbf]|' .
			            '[\xe0-\xef][\x80-\xbf]{2}|[\xf0-\xf7][\x80-\xbf]{3})+$/',
			            $decomp );
		}
		$guess = $utf8 && !$ascii && strlen($row->old_text);
		if($guess) {
			$already++;
		}
		$guesstext = $guess ? "UTF-8 already" : "converted";
		printf( "%d %d:%s '%s' %s\n",
			$row->old_id,
			$row->old_namespace,
			$row->old_title,
			$row->old_flags,
			$guesstext);
		$n++;
		
		if( $guess ) {
			# Already UTF-8: reuse the compressed bytes from the backup as-is.
			# (This must read from $row; there is no $this in a plain function.)
			$recompress = $row->old_text;
		} else {
			$converted = iconv( "CP1252", "UTF-8", $decomp );
			#$converted = mb_convert_encoding($decomp, "UTF-8", "CP1252");
			if( $converted === false ) {
				# Never emit an UPDATE that would replace the revision with
				# the compressed form of an empty string.
				print "iconv failed: {$row->old_id}\n";
				exit( 1 );
			}
			$recompress = gzdeflate( $converted );
		}
		fwrite( $out, "-- {$row->old_namespace}:{$row->old_title}, {$row->old_timestamp}, $guesstext\n" );
		$line = sprintf(
		  "UPDATE old SET old_text='%s' WHERE old_id=%d LIMIT 1;\n",
		  wfStrencode( $recompress ),
		  $row->old_id );
		fwrite( $out, $line );
	}
	fwrite( $out, "-- $n entries done.\n" );
	fclose( $out );
	echo "$n entries, $already look like UTF-8.\n";
	wfFreeResult( $res );
}
?>