#!/usr/bin/perl -wT
# GPLv3+ Matija Nalis <mnalis-perl@axe.tomsoft.hr> started 20161124

our $VERSION='1.17';

use strict;
use Getopt::Long qw (:config auto_version);


# parse command line options 
my $DEBUG = 0;
my $opt_help = 0;
my $opt_enable_replication = 0;
my $opt_remove_no_auto_create_user = 0;
my $opt_enable_pxc_streaming = 0;
my @ignore_tables_all = ();
my @ignore_tables_data = ();

my $result = GetOptions ( "help|h" => \$opt_help,
                          "--ignore-table=s"   => \@ignore_tables_all,
                          "--ignore-table-data=s"   => \@ignore_tables_data,
                          "--master-data"   => \$opt_enable_replication,
                          "--source-data"   => \$opt_enable_replication,
                          "--remove-no-auto-create-user"   => \$opt_remove_no_auto_create_user,
                          "--enable-pxc-streaming-replication"   => \$opt_enable_pxc_streaming,
                          "debug=i"  => \$DEBUG,
                          );

my $mydb = shift @ARGV;
if ($mydb) {
  if ($mydb =~ /\./) { die "invalid database name \`$mydb\`" }	# people might send SQL dump filename as first param, and we only accept mysqldump input at STDIN !
} else {
  $opt_help = 2;
}

# fix wildcards from SQL to RegEx
sub fix_wild($)
{
  my ($wild) = @_;
  $wild =~ s/%/.*/g;
  return $wild;
}

# parse DO tables
my  @do_tables = ('%'); 
if (@ARGV) {
  @do_tables = ();
  foreach my $table (@ARGV) {
      if ($table =~ /^(.*)\.(.*)$/) {	# db.table format
        my $_db = $1; my $_tbl = $2;
        if ($_db eq $mydb) {
          push @do_tables, $_tbl;
        } else {
          die "extracting DB \`$mydb\`, but trying to do table \`$table\` ??"
        }
      } else {
        push @do_tables, $table;
      }
  }
}
my $do_tables_regex = '^(' . fix_wild(join('|', @do_tables)) . ')$';
undef @do_tables;

# parse IGNORE tables ALL
if (@ignore_tables_all) {
  for (my $i=0; $i < @ignore_tables_all; $i++) {
      if ($ignore_tables_all[$i] =~ /^(.*)\.(.*)$/) {	# db.table format
        my $_db = $1; my $_tbl = $2;
        if ($_db eq $mydb) {
          $ignore_tables_all[$i] = $_tbl;
        } else {
          die "extracting DB \`$mydb\`, but trying to ignore table \`$ignore_tables_all[$i]\` ??"
        }
      }
  }
}
my $ignore_tables_all_regex = '^(' . fix_wild(join('|', @ignore_tables_all)) . ')$';
undef @ignore_tables_all;

# parse IGNORE tables DATA
if (@ignore_tables_data) {
  for (my $i=0; $i < @ignore_tables_data; $i++) {
      if ($ignore_tables_data[$i] =~ /^(.*)\.(.*)$/) {	# db.table format
        my $_db = $1; my $_tbl = $2;
        if ($_db eq $mydb) {
          $ignore_tables_data[$i] = $_tbl;
        } else {
          die "extracting DB \`$mydb\`, but trying to ignore table data \`$ignore_tables_data[$i]\` ??"
        }
      }
  }
}
my $ignore_tables_data_regex = '^(' . fix_wild(join('|', @ignore_tables_data)) . ')$';
undef @ignore_tables_data;

                                                         
if ($opt_help) {
    print <<EOF;
mysqldumpextract v$VERSION

This script parses mysqldump backup file (usually created with "mysqldump --all-databases", and extracts only one database from it.
It can also extract only some tables from that database, or extract all tables except some of them.

Usage:
mysqldumpextract [--ignore-table=table]... [--ignore-table-data=table]... [--remove-no-auto-create-user] database [table]...
Options:
\t-h, --help
\t\tPrint this help screen
\t--ignore-table=table
\t\tignores specified table. May be repeated for multiple ignores.
\t--ignore-table-data=table
\t\tignores only INSERTS for specified table (eg. create structure only). May be repeated for multiple ignores.
\t--remove-no-auto-create-user
\t\tremoves NO_AUTO_CREATE_USER from sql_mode settings (for MySQL 8.0 compatibility)
\t--enable-pxc-streaming-replication
\t\tadds PXC streaming replication hint to all INSERT statements
\t--debug
\t\tset debug level

Example usage:
\t# All tables from database "mydb1" EXCEPT "table1" and "table2" from .gz backup
\tzcat mysqldump.sql.gz | mysqldumpextract --ignore-table=mydb1.table1 --ignore-table=table2 mydb1 | gzip > short_dump.sql.gz

\t# only table5 and table6 from database "mydb2", reading from .xz backup and compressing output to .bz2 
\txzcat mysqldump.sql.xz | mysqldumpextract mydb1 table5 table6 |bzip2 > table_5_6.sql.bz2

\t# extract all tables from mydb2 with names starting with "tt", but only extract structure for "tt_log"
\txzcat mysqldump.sql.xz | mysqldumpextract --ignore-table-data=tt_log mydb2 tt% 

\t# remove NO_AUTO_CREATE_USER for MySQL 8.0 compatibility
\tcat mysqldump.sql | mysqldumpextract --remove-no-auto-create-user database

\t# table names may include "%" as SQL wildcard.
\tcat mysqldump.sql | mysqldumpextract mysql help_%
EOF
	exit 3;
}

# prints debug info
sub dbg ($$) {
  my ($dbg, $msg) = @_;
  if ($DEBUG >= $dbg) {
    chomp $msg;
    print "-- (dbg$dbg): $msg\n"; 
  }
}

# removes NO_AUTO_CREATE_USER from sql_mode if requested
sub process_sql_mode($)
{
  my ($line) = @_;
  if ($opt_remove_no_auto_create_user) {
    if ($line =~ /SET\s+sql_mode\s*=/i) {
      # Remove NO_AUTO_CREATE_USER and clean up any resulting issues with commas
      $line =~ s/,?\s*NO_AUTO_CREATE_USER\s*,?/,/gi;
      # Clean up double commas or leading/trailing commas within the string
      $line =~ s/,\s*,/,/g;
      $line =~ s/=\s*',/= '/g;
      $line =~ s/,'/'/g;
      dbg(3, "Removed NO_AUTO_CREATE_USER from sql_mode");
    }
  }
  return $line;
}

# print headers
print "-- mysqldumpextract v$VERSION started at " . localtime() . "\n";
print "--   Extracting DB: $mydb\n";
print "--   Extract tables regex: $do_tables_regex\n";
print "--   Ignore  tables regex: $ignore_tables_all_regex\n";
print "--   Ignore    data regex: $ignore_tables_data_regex\n";
if ($opt_remove_no_auto_create_user) {
  print "--   Remove NO_AUTO_CREATE_USER: yes\n";
}
if ($opt_enable_pxc_streaming) {
  print "--   Enable PXC Streaming Replication: yes\n";
}
print "--\n\n";

# does current table matches entry specified list?
sub match_wild
{
  my ($cur_table, $list_ref) = @_;
  return ($cur_table =~ /$$list_ref/);
}

my $do_db = 1;	# we start by including everything (even before first DB, there are SETtings that are needed)
my $do_table = 1;
my $in_insert = 0;	# track if we're inside a multi-line INSERT statement

# updates $do_table if table matches. returns 1 if current table should be extracted, or 0 if not.
sub match_table()
{
  # -- Table structure for table `_artikli_mjeseci`
  # -- Temporary table structure for view `_Kupovina_kod_dobavljaca`
  # -- Final view structure for view `_vw_SellingArtikl_VP`
  if (/^-- (Table structure for table|Temporary table structure for view|Final view structure for view) `(.*?)`/) {		# only change $do_table state if new table is encountered
    my $cur_table = $2;
    $do_table = 0;
    $in_insert = 0;	# reset INSERT tracking when entering new table
    if (match_wild($cur_table, \$do_tables_regex)) {			# if matched DO_TABLES...
        dbg(6, "\`$cur_table\` matches DO TABLES");
        $do_table = 1;							# ... then by default include it...
        if (match_wild($cur_table, \$ignore_tables_all_regex)) {		# ... but do NOT include it if it matches IGNORE_TABLES_ALL
            dbg(6, "\`$cur_table\` does not match IGNORE TABLES");
            $do_table = 0;
        } elsif (match_wild($cur_table, \$ignore_tables_data_regex)) {		# ... and only include CREATE TABLE it matches IGNORE_TABLES_DATA
            dbg(6, "\`$cur_table\` does not match IGNORE TABLES DATA");
            $do_table = 2;		# 2 == skip INSERTs
        }
    }
    dbg(5, "final \`$cur_table\`, do_table=$do_table");
  }
  return $do_table;
}

# updates $do_db if database matches. returns 1 if current database should be extracted, or 0 if not.
sub match_db()
{
  if ((/^CHANGE MASTER/i) || (/^SET GLOBAL gtid_slave_pos=/i)) {
    if ($opt_enable_replication) {
      dbg(1, "replication requested, BEWARE, allowing pass-through of line=$_");
      return 1;
    } else {
      dbg(1, "replication not explicitly requested, ignoring line=$_");
      return 0;					# by default, when extracting partial data, ignore replication commands
    }
  }
  if (/^-- Current Database: `(.*?)`/) {	# only change $do_db state if new database is encountered
    if ("$1" eq "$mydb") {
      $do_db = 1; 
      $do_table = 1;
      $in_insert = 0;	# reset INSERT tracking when entering new database
      dbg(1, "matched DB=\`$mydb\`");
    } else {
      $do_db = 0;
      $in_insert = 0;	# reset INSERT tracking when leaving database
      dbg(1, "DB \`$1\` is no longer \`$mydb\`");
    };
  }
  if ($do_db) {					# if currently dumping data, do not include database drop/create/use commands
    if (/DROP DATABASE .*`$mydb`/i) { dbg(2,"ignore $_"); return 0; }
    if (/CREATE DATABASE .*`$mydb`/i) { dbg(2,"ignore $_"); return 0; }
    if (/^USE .*`$mydb`/i) { dbg(2,"ignore $_"); return 0; }
  }
  dbg(8, "do_db=$do_db, line=$_");
  return $do_db;
}

#
# main loop
#
while (<STDIN>) {
  next if !match_db();				# skip uninteresting databases alltogether
  my $should_print_table = match_table();
  if ($should_print_table == 1) {		# print ALL
      $_ = process_sql_mode($_);
      # Add PXC streaming replication hint if enabled
      if ($opt_enable_pxc_streaming && /^\s*INSERT\s+INTO/i) {
        s/INSERT\s+INTO/INSERT \/*+ SET_VAR(wsrep_trx_fragment_size=104857600) *\/ INTO/i;
        dbg(7, "Added PXC streaming hint: $_");
      }
      print;
  } elsif ($should_print_table == 2) {		# print ALL except INSERTs (that is, CREATE TABLEs)
      # Handle multi-line INSERT statements (MariaDB 10.11+)
      if (/^\s*INSERT /i) {
        # Start of INSERT statement
        $in_insert = 1;
        dbg(7, "Skipping INSERT start: $_");
        # Check if this is a single-line INSERT (ends with semicolon at end of line, not inside data)
        if (/\);\s*$/) {
          $in_insert = 0;
          dbg(7, "Single-line INSERT detected and skipped");
        }
      } elsif ($in_insert) {
        # We're inside a multi-line INSERT, skip until we find the closing parenthesis and semicolon
        dbg(7, "Skipping INSERT continuation: $_");
        if (/\);\s*$/) {
          $in_insert = 0;
          dbg(7, "End of multi-line INSERT detected");
        }
      } else {
        # Not an INSERT line, print it
        $_ = process_sql_mode($_);
        print;
      }
  }
}

print "\n-- mysqldumpextract v$VERSION ended at " . localtime() . "\n";

exit 0;