import psycopg2
import pymysql
from kafka import KafkaConsumer, TopicPartition
import configparser
from concurrent.futures import ThreadPoolExecutor, as_completed
import json
import re
import logging

# Configure logging
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# Load the configuration file
config = configparser.ConfigParser()
config.read('config.ini')

# Kafka settings
kafka_bootstrap_servers = config.get('kafka', 'bootstrap_servers')
topic_prefix = config.get('kafka', 'topic_prefix')

# Which database backend to write to (compared case-insensitively)
db_type = config.get('database', 'type').lower()

# Keyword arguments passed to the selected driver's connect()
if db_type == 'postgresql':
    db_config = {
        option: config.get('postgresql', option)
        for option in ('dbname', 'user', 'password', 'host', 'port')
    }
elif db_type == 'mysql':
    # The left name is the connect() keyword, the right name is the option
    # read from the [mysql] section of the config file.
    db_config = {
        keyword: config.get('mysql', option)
        for keyword, option in (
            ('database', 'dbname'),
            ('user', 'user'),
            ('password', 'password'),
            ('host', 'host'),
            ('port', 'port'),
        )
    }
    # The MySQL settings carry the port as an int rather than a string.
    db_config['port'] = int(db_config['port'])
else:
    raise ValueError("Unsupported database type. Use 'postgresql' or 'mysql'.")

# Prefix prepended to every generated table name
table_prefix = config.get('database', 'table_prefix')

def create_table_if_not_exists(cursor, table_name):
    """Create the message table ``table_name`` unless it already exists.

    Each row holds a message payload (``data``) together with the offset and
    partition it came from; ``(offset_id, partition_id)`` is the primary key.
    The payload column is ``JSONB`` on PostgreSQL and ``JSON`` on MySQL, as
    selected by the module-level ``db_type``.

    Args:
        cursor: Cursor on which the statement is executed.
        table_name: Table to create; interpolated into the SQL text as-is.
    """
    # Only the payload column type differs between the two backends.
    if db_type == 'postgresql':
        payload_type = 'JSONB'
    elif db_type == 'mysql':
        payload_type = 'JSON'
    ddl = f"""
            CREATE TABLE IF NOT EXISTS {table_name} (
                data {payload_type} NOT NULL,
                offset_id BIGINT NOT NULL,
                partition_id INT NOT NULL,
                PRIMARY KEY (offset_id, partition_id)
            )
        """
    cursor.execute(ddl)

# Initialise the Kafka consumer
consumer = KafkaConsumer(
    bootstrap_servers=kafka_bootstrap_servers,
    auto_offset_reset='earliest',
    enable_auto_commit=False,  # no automatic offset commits
    value_deserializer=lambda x: json.loads(x.decode('utf-8'))  # message values are assumed to be UTF-8 encoded JSON
)

# Fetch every topic the consumer can see
topics = consumer.topics()

# Keep the topics matching the configured prefix and make sure each one has a
# table. Note that topic_prefix is interpreted as a regular expression anchored
# at the start of the topic name (re.match), not as a literal string.
matching_topics = [topic for topic in topics if re.match(f'{topic_prefix}.*', topic)]
if matching_topics:
    # One connection serves all tables; it is closed even if a CREATE fails,
    # so a failure during start-up no longer leaks the connection.
    if db_type == 'postgresql':
        connection = psycopg2.connect(**db_config)
    elif db_type == 'mysql':
        connection = pymysql.connect(**db_config)
    try:
        for topic in matching_topics:
            table_name = f"{table_prefix}{topic.replace('-', '_').replace('.', '_')}"  # replace '-' and '.' with '_'
            with connection.cursor() as cursor:
                create_table_if_not_exists(cursor, table_name)
            connection.commit()
            logging.info(f"已创建表: {table_name}")
    finally:
        connection.close()

def get_partitions():
    """Return a ``TopicPartition`` for every partition of every matching topic.

    Topics are visited in ``matching_topics`` order. A topic for which the
    consumer reports no partitions (``None`` or an empty collection)
    contributes nothing to the result.
    """
    return [
        TopicPartition(topic_name, partition_id)
        for topic_name in matching_topics
        # `or ()` skips topics whose partition lookup came back empty/None
        for partition_id in (consumer.partitions_for_topic(topic_name) or ())
    ]

def load_last_offsets(partitions):
    """Look up the highest offset already stored for each partition.

    A single database connection is used for all lookups and is always
    closed, including when a query raises.

    Args:
        partitions: Objects exposing ``topic`` and ``partition`` attributes
            (the caller passes ``TopicPartition`` instances).

    Returns:
        dict mapping each given partition to the largest ``offset_id`` found
        in that topic's table for the partition, or ``None`` when the table
        has no row for it. Empty input yields an empty dict without opening
        a connection.

    Raises:
        Any exception raised by the database driver while connecting or
        querying is propagated unchanged.
    """
    offsets = {}
    if not partitions:
        return offsets

    if db_type == 'postgresql':
        connection = psycopg2.connect(**db_config)
    elif db_type == 'mysql':
        connection = pymysql.connect(**db_config)
    try:
        with connection.cursor() as cursor:
            for partition in partitions:
                # Same topic -> table name mapping used when the tables are created.
                table_name = f"{table_prefix}{partition.topic.replace('-', '_').replace('.', '_')}"
                select_query = f"""
            SELECT offset_id
            FROM {table_name}
            WHERE partition_id = %s
            ORDER BY offset_id DESC
            LIMIT 1
        """
                cursor.execute(select_query, (partition.partition,))
                row = cursor.fetchone()
                offsets[partition] = row[0] if row else None
    finally:
        connection.close()
    return offsets

# Resume each partition right after the last offset stored in the database;
# partitions with no stored offset keep the consumer's default start position.
partitions = get_partitions()
last_offsets = load_last_offsets(partitions)
consumer.assign(partitions)
for partition, offset in last_offsets.items():
    if offset is None:
        continue
    # The stored offset was already processed, so continue with the next one.
    consumer.seek(partition, offset + 1)

def process_message(message):
    """Insert one consumed Kafka message into its topic's table.

    The payload is stored as JSON text together with the message's offset
    and partition. A fresh database connection is opened for the insert and
    is always closed, including when the insert or commit fails.

    Args:
        message: Consumed record exposing ``topic``, ``partition``,
            ``offset`` and ``value``; ``value`` must be JSON-serialisable.

    Returns:
        tuple: ``(topic, partition, offset)`` of the stored message.

    Raises:
        Exception: Any error raised while connecting, serialising or
            inserting is logged and then re-raised for the caller to handle.
    """
    try:
        data = message.value
        offset_id = message.offset
        partition_id = message.partition

        table_name = f"{table_prefix}{message.topic.replace('-', '_').replace('.', '_')}"
        insert_query = f"""
            INSERT INTO {table_name} (data, offset_id, partition_id)
            VALUES (%s, %s, %s)
        """
        if db_type == 'postgresql':
            connection = psycopg2.connect(**db_config)
        elif db_type == 'mysql':
            connection = pymysql.connect(**db_config)
        try:
            with connection.cursor() as cursor:
                cursor.execute(insert_query, (json.dumps(data), offset_id, partition_id))
            connection.commit()
        finally:
            # Close on success and on failure so a failed insert does not leak
            # the connection.
            connection.close()
        logging.info(f"已插入来自主题 {message.topic}, 分区 {message.partition}, 偏移量 {message.offset} 的消息")
        return (message.topic, message.partition, message.offset)
    except Exception as e:
        logging.error(f"处理消息时发生错误: {e}")
        raise  # re-raise with the original traceback for the main loop to handle

# Consume messages and store each one, with its offset, in the database.
#
# Messages are fetched in bounded batches with poll(). Iterating the consumer
# directly never finishes when no consumer timeout is configured (none is set
# where this consumer is created), so the list of futures would grow without
# limit and the result handling below would never run.
try:
    with ThreadPoolExecutor(max_workers=10) as executor:
        while True:
            # poll() waits up to timeout_ms and returns a dict mapping each
            # partition to the list of records fetched for it (empty if none).
            batch = consumer.poll(timeout_ms=1000)
            futures = [
                executor.submit(process_message, message)
                for messages in batch.values()
                for message in messages
            ]

            # Wait for the whole batch before fetching the next one
            for future in as_completed(futures):
                try:
                    result = future.result()
                    logging.info(f"已处理来自主题 {result[0]}, 分区 {result[1]}, 偏移量 {result[2]} 的消息")
                except Exception as e:
                    # A failed message is only logged; the loop keeps consuming.
                    logging.error(f"处理消息时发生异常: {e}")

except Exception as e:
    logging.error(f"主执行发生错误: {e}")
finally:
    consumer.close()
