diff options
Diffstat (limited to 'AbuseFilter/maintenance/updateVarDumps.php')
-rw-r--r-- | AbuseFilter/maintenance/updateVarDumps.php | 700 |
1 files changed, 700 insertions, 0 deletions
diff --git a/AbuseFilter/maintenance/updateVarDumps.php b/AbuseFilter/maintenance/updateVarDumps.php new file mode 100644 index 00000000..a29e3553 --- /dev/null +++ b/AbuseFilter/maintenance/updateVarDumps.php @@ -0,0 +1,700 @@ +<?php + +use MediaWiki\Extension\AbuseFilter\AbuseFilterServices; +use MediaWiki\Extension\AbuseFilter\KeywordsManager; +use MediaWiki\MediaWikiServices; +use Wikimedia\AtEase\AtEase; +use Wikimedia\Rdbms\Database; +use Wikimedia\Rdbms\IResultWrapper; + +/** + * Performs several tasks aiming to update the stored var dumps for filter hits. + * See T213006 for a list. + * + * @ingroup Maintenance + */ +if ( getenv( 'MW_INSTALL_PATH' ) ) { + $IP = getenv( 'MW_INSTALL_PATH' ); +} else { + $IP = __DIR__ . '/../../..'; +} +require_once "$IP/maintenance/Maintenance.php"; + +class UpdateVarDumps extends LoggedUpdateMaintenance { + /** @var Database A connection to replica */ + private $dbr; + /** @var Database A connection to the master */ + private $dbw; + /** @var bool Whether we're performing a dry run */ + private $dryRun = false; + /** @var int Count of rows in the abuse_filter_log table */ + private $allRowsCount; + /** @var bool Whether to print progress markers */ + private $progressMarkers; + /** @var string|null */ + private $printOrphanedFile; + /** @var int|null How many seconds to sleep after each batch. */ + private $sleep; + /** @var KeywordsManager */ + private $keywordsManager; + + /** + * @inheritDoc + */ + public function __construct() { + parent::__construct(); + + $this->addDescription( 'Update AbuseFilter var dumps - T213006' ); + $this->addOption( 'dry-run-verbose', 'Perform a verbose dry run' ); + $this->addOption( 'dry-run', 'Perform a dry run' ); + $this->addOption( 'progress-markers', 'Print progress markers every 10 batches' ); + $this->addOption( + 'print-orphaned-records-to', + 'Print ExternalStore urls of orphaned ExternalStore records (if any) ' . + 'to the given file. Can use stdout, but it\'s not recommended for big databases.', + false, + true + ); + $this->addOption( 'sleep', 'Sleep this many seconds after each batch', false, true ); + $this->requireExtension( 'Abuse Filter' ); + $this->setBatchSize( 500 ); + $this->keywordsManager = AbuseFilterServices::getKeywordsManager(); + } + + /** + * @inheritDoc + */ + public function getUpdateKey() { + return __CLASS__; + } + + /** + * @inheritDoc + */ + public function doDBUpdates() { + if ( $this->hasOption( 'dry-run-verbose' ) || $this->hasOption( 'dry-run' ) ) { + // This way the script can be called with dry-run-verbose only and we can check for dry-run + $this->dryRun = true; + } + $this->progressMarkers = $this->hasOption( 'progress-markers' ); + $this->printOrphanedFile = $this->getOption( 'print-orphaned-records-to' ); + $this->sleep = $this->getOption( 'sleep' ); + + // Faulty rows aren't inserted anymore, hence we can query the replica and update the master. + $this->dbr = wfGetDB( DB_REPLICA ); + $this->dbw = wfGetDB( DB_MASTER ); + + // Control batching with the primary key to keep the queries performant and allow gaps + $this->allRowsCount = (int)$this->dbr->selectField( + 'abuse_filter_log', + 'MAX(afl_id)', + [], + __METHOD__ + ); + + if ( $this->allRowsCount === 0 ) { + $this->output( "...the abuse_filter_log table is empty.\n" ); + return !$this->dryRun; + } + + // Do the actual work. Note that several actions are superfluous (e.g. in fixMissingDumps + // we use "stored-text" but then we replace it in updateAflVarDump), but that's because of SRP. + + // First, ensure that afl_var_dump isn't empty + $this->fixMissingDumps(); + // Then, ensure that abuse_filter_log.afl_var_dump only contains "stored-text:xxxx" + $this->moveToText(); + // Then update the storage format in the text table + $this->updateText(); + // Finally, replace "stored-text:xxxx" with "tt:xxxx" for all rows + $this->updateAflVarDump(); + + return !$this->dryRun; + } + + /** + * Handle empty afl_var_dump. gerrit/16527 fixed a bug which caused an extra abuse_filter_log + * row to be inserted without the var dump for a given action. If we find a row identical to + * the current one but with a valid dump, just delete the current one. Otherwise, store a + * very basic var dump for sanity. + * This handles point 7. of T213006. + */ + private function fixMissingDumps() { + $this->output( "...Checking for missing dumps (1/4)\n" ); + $batchSize = $this->getBatchSize(); + + $prevID = 0; + $curID = $batchSize; + $deleted = $rebuilt = 0; + do { + $this->maybePrintProgress( $prevID ); + $brokenRows = $this->dbr->select( + 'abuse_filter_log', + '*', + [ + 'afl_var_dump' => '', + "afl_id > $prevID", + "afl_id <= $curID" + ], + __METHOD__, + [ 'ORDER BY' => 'afl_id ASC' ] + ); + $prevID = $curID; + $curID += $batchSize; + + $res = $this->doFixMissingDumps( $brokenRows ); + $deleted += $res['deleted']; + $rebuilt += $res['rebuilt']; + MediaWikiServices::getInstance()->getDBLoadBalancerFactory()->waitForReplication(); + $this->maybeSleep(); + } while ( $prevID <= $this->allRowsCount ); + + if ( $this->dryRun ) { + $this->output( + "...found $deleted rows with blank afl_var_dump to delete, and " . + "$rebuilt rows to rebuild.\n" + ); + } else { + $this->output( + "...deleted $deleted rows with blank afl_var_dump, and rebuilt " . + "$rebuilt rows.\n" + ); + } + } + + /** + * @param IResultWrapper $brokenRows + * @return int[] + */ + private function doFixMissingDumps( IResultWrapper $brokenRows ) { + $deleted = 0; + foreach ( $brokenRows as $row ) { + if ( $row->afl_var_dump === '' ) { + $findRow = array_diff_key( + get_object_vars( $row ), + [ 'afl_var_dump' => true, 'afl_id' => true ] + ); + // This is the case where we may have a duplicate row. The wrong insertion happened + // right before the correct one, so their afl_id should only differ by 1, but let's + // play safe and only assume it's greater. Note that the two entries are guaranteed + // to have the same timestamp. + $findRow[] = 'afl_id > ' . $this->dbr->addQuotes( $row->afl_id ); + $saneDuplicate = $this->dbr->selectRow( + 'abuse_filter_log', + '1', + $findRow, + __METHOD__ + ); + + if ( $saneDuplicate ) { + // Just delete the row! + $deleted++; + if ( !$this->dryRun ) { + $this->dbw->delete( + 'abuse_filter_log', + [ 'afl_id' => $row->afl_id ], + __METHOD__ + ); + } + continue; + } + } + if ( $this->dryRun ) { + continue; + } + // Build a VariableHolder with the only values we can be sure of + $vars = AbuseFilterVariableHolder::newFromArray( [ + 'timestamp' => wfTimestamp( TS_UNIX, $row->afl_timestamp ), + 'action' => $row->afl_action + ] ); + // Add some action-specific variables + if ( strpos( $row->afl_action, 'createaccount' ) !== false ) { + $vars->setVar( 'accountname', $row->afl_user_text ); + } else { + $vars->setVar( 'user_name', $row->afl_user_text ); + $title = Title::makeTitle( $row->afl_namespace, $row->afl_title ); + if ( $row->afl_action !== 'move' ) { + $vars->setVar( 'page_title', $title->getText() ); + $vars->setVar( 'page_prefixedtitle', $title->getPrefixedText() ); + } else { + $vars->setVar( 'moved_from_title', $title->getText() ); + $vars->setVar( 'moved_from_prefixedtitle', $title->getPrefixedText() ); + } + } + + $storedID = AbuseFilter::storeVarDump( $vars ); + $this->dbw->update( + 'abuse_filter_log', + [ 'afl_var_dump' => "tt:$storedID" ], + [ 'afl_id' => $row->afl_id ], + __METHOD__ + ); + } + $rebuilt = $brokenRows->numRows() - $deleted; + return [ 'rebuilt' => $rebuilt, 'deleted' => $deleted ]; + } + + /** + * If afl_var_dump contains serialized data, move the dump to the text table. + * This handles point 1. of T213006. + */ + private function moveToText() { + $this->output( "...Moving serialized data away from the abuse_filter_log table (2/4).\n" ); + $batchSize = $this->getBatchSize(); + + $prevID = 0; + $curID = $batchSize; + $changeRows = $truncatedDumps = 0; + do { + $this->maybePrintProgress( $prevID ); + $res = $this->dbr->select( + 'abuse_filter_log', + [ 'afl_id', 'afl_var_dump' ], + [ + 'afl_var_dump NOT ' . $this->dbr->buildLike( + 'stored-text:', + $this->dbr->anyString() + ), + 'afl_var_dump NOT ' . $this->dbr->buildLike( + 'tt:', + $this->dbr->anyString() + ), + "afl_id > $prevID", + "afl_id <= $curID" + ], + __METHOD__, + [ 'ORDER BY' => 'afl_id ASC' ] + ); + + $prevID = $curID; + $curID += $batchSize; + + $result = $this->doMoveToText( $res ); + $changeRows += $result['change']; + $truncatedDumps += $result['truncated']; + MediaWikiServices::getInstance()->getDBLoadBalancerFactory()->waitForReplication(); + $this->maybeSleep(); + } while ( $prevID <= $this->allRowsCount ); + + $msg = $this->dryRun ? + "...found $changeRows abuse_filter_log rows with serialized data and $truncatedDumps " . + "truncated dumps to rebuild.\n" : + "...moved $changeRows abuse_filter_log rows and rebuilt $truncatedDumps " . + "truncated dumps.\n"; + + $this->output( $msg ); + } + + /** + * @param IResultWrapper $rows + * @return int[] + */ + private function doMoveToText( IResultWrapper $rows ) { + $changeRows = $truncatedDumps = 0; + foreach ( $rows as $row ) { + // Sanity: perform a very raw check to confirm that the dump is indeed a serialized value + $re = '/^(a:\d+:{|O:25:"[Aa]buse[Ff]ilter[Vv]ariable[Hh]older":\d+:{)/'; + if ( !preg_match( $re, $row->afl_var_dump ) ) { + $this->fatalError( + "...found a value in afl_var_dump for afl_id {$row->afl_id} which is " . + "neither a reference to the text table or a serialized value: {$row->afl_var_dump}.\n" + ); + } + + AtEase::suppressWarnings(); + $stored = unserialize( $row->afl_var_dump ); + AtEase::restoreWarnings(); + if ( !$stored ) { + $re = '/^O:25:"[Aa]buse[Ff]ilter[Vv]ariable[Hh]older":\d+:{/'; + if ( preg_match( $re, $row->afl_var_dump ) ) { + $this->fatalError( + "...found a corrupted afl_var_dump for afl_id {$row->afl_id} containing " . + "a truncated object: {$row->afl_var_dump}.\n" + ); + } + $stored = $this->restoreTruncatedDump( $row->afl_var_dump ); + $truncatedDumps++; + } + if ( !is_array( $stored ) && !( $stored instanceof AbuseFilterVariableHolder ) ) { + $this->fatalError( + '...found unexpected data type ( ' . gettype( $stored ) . ' ) in ' . + "afl_var_dump for afl_id {$row->afl_id}.\n" + ); + } + $changeRows++; + + if ( !$this->dryRun ) { + $holder = is_array( $stored ) ? AbuseFilterVariableHolder::newFromArray( $stored ) : $stored; + // Note: this will upgrade to the new JSON format, so we use tt: + $newDump = AbuseFilter::storeVarDump( $holder ); + $this->dbw->update( + 'abuse_filter_log', + [ 'afl_var_dump' => "tt:$newDump" ], + [ 'afl_id' => $row->afl_id ], + __METHOD__ + ); + } + } + return [ 'change' => $changeRows, 'truncated' => $truncatedDumps ]; + } + + /** + * Try to restore a truncated dumps. This could happen for very old rows, where afl_var_dump + * was a blob instead of a longblob, and we tried to insert very long strings there. + * This handles point 9. of T214193. + * + * @param string $dump The broken serialized dump + * @return array With everything that we can restore from $dump on success + */ + private function restoreTruncatedDump( $dump ) { + // This method makes various assumptions: + // 1 - Everything is wrapped inside an array + // 2 - Array elements can only be strings, integers, bools or null + // 3 - Array keys can only be strings + // As this is what a serialized dump should look like. + $string = preg_replace( '/^a:\d+:{/', '', $dump ); + + $ret = []; + $key = null; + + while ( strlen( $string ) > 2 || $string === 'N;' ) { + $type = substr( $string, 0, 2 ); + switch ( $type ) { + case 's:': + // Quotes aren't escaped, so we need to figure out how many characters to include + $matches = []; + if ( !preg_match( '/^s:(\d+):"/', $string, $matches ) ) { + break 2; + } + $len = (int)$matches[1]; + $val = substr( $string, strlen( $matches[0] ), $len ); + if ( strlen( $val ) === $len ) { + if ( $key === null ) { + // It's an array key + $key = $val; + } else { + $ret[$key] = $val; + $key = null; + } + $offset = strlen( $matches[0] ) + $len + 2; + break; + } else { + // The truncation happened in the middle of the string + break 2; + } + case 'i:': + if ( preg_match( '/^i:(-?\d+);/', $string, $matches ) ) { + if ( $key === null ) { + throw new UnexpectedValueException( "Unexpected integer key: $string" ); + } + $ret[$key] = intval( $matches[1] ); + $key = null; + $offset = strlen( $matches[0] ); + break; + } else { + break 2; + } + case 'b:': + if ( preg_match( '/^b:([01]);/', $string, $matches ) ) { + if ( $key === null ) { + throw new UnexpectedValueException( "Unexpected bool key: $string" ); + } + $ret[$key] = (bool)$matches[1]; + $key = null; + $offset = 4; + break; + } else { + break 2; + } + case 'N;': + if ( $key === null ) { + throw new UnexpectedValueException( "Unexpected null key: $string" ); + } + $ret[$key] = null; + $key = null; + $offset = 2; + break; + default: + break 2; + } + + // Remove the value we have just parsed + // @phan-suppress-next-next-line PhanPossiblyUndeclaredVariable + // @phan-suppress-next-line PhanTypeMismatchArgumentNullableInternal + $string = substr( $string, $offset ); + } + + if ( $this->hasOption( 'dry-run-verbose' ) ) { + $this->output( + "...converted the following corrupted dump:\n\n$dump\n\n to this:\n\n" . + var_export( $ret, true ) . "\n\n" + ); + } + + return $ret; + } + + /** + * If the text table (or the External Storage) contains a serialized AbuseFilterVariableHolder + * or array, re-store it as a JSON-encoded array. This assumes that afl_var_dump rows starting + * with 'tt:' already point to JSON dumps, and afl_var_dump rows starting with 'stored-text:' + * only point to serialized dumps. + * This handles point 2. and 6. of T213006. + */ + private function updateText() { + $this->output( + "...Re-storing serialized dumps as JSON-encoded arrays for all rows (3/4).\n" + ); + if ( $this->printOrphanedFile !== null && !$this->dryRun ) { + $this->output( "Printing orphaned records to $this->printOrphanedFile.\n" ); + file_put_contents( + $this->printOrphanedFile, + "Records orphaned by AbuseFilter's updateVarDumps sccript\n", + FILE_APPEND + ); + } + + $batchSize = $this->getBatchSize(); + $prevID = 0; + $curID = $batchSize; + $count = 0; + + $idSQL = $this->dbr->buildIntegerCast( $this->dbr->strreplace( + 'afl_var_dump', + $this->dbr->addQuotes( 'stored-text:' ), + $this->dbr->addQuotes( '' ) + ) ); + + $dumpLike = $this->dbr->buildLike( 'stored-text:', $this->dbr->anyString() ); + $esAccess = MediaWikiServices::getInstance()->getExternalStoreAccess(); + do { + $this->maybePrintProgress( $prevID ); + $res = $this->dbr->select( + [ 'text', 'abuse_filter_log' ], + [ 'old_id', 'old_text', 'old_flags' ], + [ + "afl_var_dump $dumpLike", + "afl_id > $prevID", + "afl_id <= $curID" + ], + __METHOD__, + [ 'DISTINCT', 'ORDER BY' => 'old_id ASC' ], + [ 'abuse_filter_log' => [ 'JOIN', "old_id = $idSQL" ] ] + ); + + $prevID = $curID; + $curID += $batchSize; + $count += $res->numRows(); + + if ( !$this->dryRun ) { + $this->doUpdateText( $res, $esAccess ); + MediaWikiServices::getInstance()->getDBLoadBalancerFactory()->waitForReplication(); + } + $this->maybeSleep(); + } while ( $prevID <= $this->allRowsCount ); + + $msg = $this->dryRun + ? "...found $count text rows to update.\n" + : "...updated $count text rows.\n"; + $this->output( $msg ); + } + + /** + * @param IResultWrapper $res text rows + * @param ExternalStoreAccess $esAccess + */ + private function doUpdateText( IResultWrapper $res, ExternalStoreAccess $esAccess ) { + $orphaned = []; + foreach ( $res as $row ) { + // This is copied from AbuseFilter::loadVarDump + $oldFlags = explode( ',', $row->old_flags ); + $text = $row->old_text; + if ( in_array( 'external', $oldFlags ) ) { + $text = $esAccess->fetchFromURL( $row->old_text ); + } + if ( in_array( 'gzip', $oldFlags ) ) { + $text = gzinflate( $text ); + } + + if ( FormatJson::decode( $text ) !== null ) { + // Already in the new format, apparently. + if ( + !in_array( 'utf-8', $oldFlags, true ) || + in_array( 'nativeDataArray', $oldFlags, true ) + ) { + // Sanity + $this->fatalError( "Row {$row->old_id} is JSON-encoded with wrong flags: {$row->old_flags}" ); + } + continue; + } + + $obj = unserialize( $text ); + + $varArray = $obj instanceof AbuseFilterVariableHolder + ? $obj->dumpAllVars( [ 'old_wikitext', 'new_wikitext' ] ) + : $obj; + $varArray = $this->updateVariables( $varArray ); + // Recreating flags will also ensure that we don't add 'nativeDataArray' + $newFlags = [ 'utf-8' ]; + // This is copied from AbuseFilter::storeVarDump + $toStore = FormatJson::encode( $varArray ); + if ( in_array( 'gzip', $oldFlags ) && function_exists( 'gzdeflate' ) ) { + $toStore = gzdeflate( $toStore ); + $newFlags[] = 'gzip'; + } + if ( in_array( 'external', $oldFlags ) ) { + $orphaned[] = $row->old_text; + $toStore = $esAccess->insert( $toStore ); + $newFlags[] = 'external'; + } + + $this->dbw->update( + 'text', + [ + 'old_text' => $toStore, + 'old_flags' => implode( ',', $newFlags ) + ], + [ 'old_id' => $row->old_id ], + __METHOD__ + ); + } + if ( $this->printOrphanedFile !== null && $orphaned ) { + file_put_contents( $this->printOrphanedFile, implode( ', ', $orphaned ) . "\n", FILE_APPEND ); + } + } + + /** + * Given a stored object, removes some disabled variables and update deprecated ones. + * Also ensure that core variables are lowercase. + * Handles points 4., 5. and 8. of T213006. + * + * @param array $vars The stored vars. + * @return array + */ + private function updateVariables( array $vars ) { + // Remove all variables used in the past to store metadata + unset( $vars['context'], $vars['logged_local_ids'], $vars['logged_global_ids'] ); + + $builtinVars = $this->getBuiltinVarNames(); + $newVars = []; + foreach ( $vars as $oldName => $value ) { + $lowerName = strtolower( $oldName ); + if ( $lowerName !== $oldName && array_key_exists( $lowerName, $builtinVars ) ) { + $oldName = $lowerName; + } + $deprecatedVars = $this->keywordsManager->getDeprecatedVariables(); + $newName = $deprecatedVars[$oldName] ?? $oldName; + $newVars[$newName] = $value; + } + return $newVars; + } + + /** + * Get a set of builtin variable names. Copied from AbuseFilterVariableHolder::dumpAllVars. + * @return array [ varname => true ] for instantaneous search. All names are lowercase + */ + private function getBuiltinVarNames() { + global $wgRestrictionTypes; + + static $coreVariables = null; + + if ( $coreVariables ) { + return $coreVariables; + } + + $activeVariables = array_keys( $this->keywordsManager->getVarsMappings() ); + $deprecatedVariables = array_keys( $this->keywordsManager->getDeprecatedVariables() ); + $disabledVariables = array_keys( $this->keywordsManager->getDisabledVariables() ); + $coreVariables = array_merge( $activeVariables, $deprecatedVariables, $disabledVariables ); + + $prefixes = [ 'moved_from', 'moved_to', 'page' ]; + foreach ( $wgRestrictionTypes as $action ) { + foreach ( $prefixes as $prefix ) { + $coreVariables[] = "{$prefix}_restrictions_$action"; + } + } + + $coreVariables = array_fill_keys( $coreVariables, true ); + $coreVariables = array_change_key_case( $coreVariables ); + + return $coreVariables; + } + + /** + * Replace 'stored-text:' with 'tt:' in afl_var_dump. Handles point 3. of T213006. + */ + private function updateAflVarDump() { + $this->output( + "...Replacing the 'stored-text:' prefix with 'tt:' (4/4).\n" + ); + + $batchSize = $this->getBatchSize(); + + // Use native SQL functions so that we can update all rows at the same time. + $newIdSQL = $this->dbw->strreplace( + 'afl_var_dump', + $this->dbr->addQuotes( 'stored-text:' ), + $this->dbr->addQuotes( 'tt:' ) + ); + + $prevID = 0; + $curID = $batchSize; + $numRows = 0; + do { + $this->maybePrintProgress( $prevID ); + $args = [ + 'abuse_filter_log', + [ "afl_var_dump = $newIdSQL" ], + [ + "afl_id > $prevID", + "afl_id <= $curID", + 'afl_var_dump ' . $this->dbr->buildLike( 'stored-text:', $this->dbr->anyString() ) + ], + __METHOD__, + [ 'ORDER BY' => 'afl_id ASC' ] + ]; + if ( $this->dryRun ) { + $numRows += $this->dbr->selectRowCount( ...$args ); + } else { + $this->dbw->update( ...$args ); + $numRows += $this->dbw->affectedRows(); + MediaWikiServices::getInstance()->getDBLoadBalancerFactory()->waitForReplication(); + } + + $prevID = $curID; + $curID += $batchSize; + $this->maybeSleep(); + } while ( $prevID <= $this->allRowsCount ); + + if ( $this->dryRun ) { + $this->output( "...would change afl_var_dump for $numRows rows.\n" ); + } else { + $this->output( "...updated afl_var_dump prefix for $numRows rows.\n" ); + } + } + + /** + * Print a progress marker if the respective option is enabled + * + * @param int $start + */ + private function maybePrintProgress( int $start ) : void { + if ( $this->progressMarkers && $start % ( 10 * $this->getBatchSize() ) === 0 ) { + $end = $start + $this->getBatchSize(); + $this->output( "...Doing range $start - $end\n" ); + } + } + + /** + * Sleep for a while, if required. Note: checking the value is several + * orders of magnitude faster than calling sleep(0). + */ + private function maybeSleep() : void { + if ( $this->sleep ) { + sleep( $this->sleep ); + } + } +} + +$maintClass = 'UpdateVarDumps'; +require_once RUN_MAINTENANCE_IF_MAIN; |