commit 05a8338c7278e0fcec2a664b2f822534e344772f
Author: TaurusXin <i@taurusxin.com>
Date:   Mon Sep 30 11:05:35 2024 +0800

    init repo

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..90908ea
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,9 @@
+# project files
+dataset/
+
+*.pth
+
+# python files
+__pycache__/
+
+.venv
\ No newline at end of file
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..d8fbbe8
--- /dev/null
+++ b/README.md
@@ -0,0 +1,62 @@
+# Captcha Recognition
+
+基于深度神经网络(DNN)的验证码识别
+
+## 使用方法
+
+1. 克隆项目到本地
+
+```shell
+git clone https://git.taurusxin.com/taurusxin/captcha.git
+cd captcha
+```
+
+2. 创建虚拟环境并安装依赖
+
+```shell
+python -m venv .venv
+
+# Windows
+.venv\Scripts\Activate.ps1
+
+# Linux/MacOS
+source .venv/bin/activate
+
+# 先安装 PyTorch GPU 版本，cuda 12.4
+pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu124
+
+# 然后再安装其他依赖
+pip install -r requirements.txt
+```
+
+3. 根据提示运行 3 次数据集生成脚本，分别生成训练集（train）、测试集（test）和预测集（predict）。
+
+建议的数据集长度如下：
+
+| 数据集    | 长度     |
+| -------- | ------- |
+| Train    | 50000   |
+| Test     | 1000    |
+| Predict  | 30      |
+
+```shell
+python captcha_gen.py
+```
+
+4. 训练模型
+
+```shell
+python train.py
+```
+
+5. 测试模型
+
+```shell
+python test.py
+```
+
+6. 预测验证码
+
+```shell
+python predict.py
+```
diff --git a/captcha_gen.py b/captcha_gen.py
new file mode 100644
index 0000000..ef53341
--- /dev/null
+++ b/captcha_gen.py
@@ -0,0 +1,47 @@
+"""Generate captcha images for the train / test / predict datasets."""
+import captcha_settings
+import os
+import random
+
+from captcha.image import ImageCaptcha
+from PIL import Image
+
+from tqdm import trange
+
+
+def random_captcha_text(char_set=captcha_settings.NUMBER + captcha_settings.ALPHABET, captcha_size=4):
+    """Return ``captcha_size`` characters picked at random from ``char_set``, joined into a string."""
+    return "".join(random.choice(char_set) for _ in range(captcha_size))
+
+
+def gen_captcha_text_and_image():
+    """Generate one random captcha and return it as a ``(text, PIL image)`` tuple."""
+    image = ImageCaptcha()
+    captcha_text = random_captcha_text()
+    captcha_image = Image.open(image.generate(captcha_text))
+    return captcha_text, captcha_image
+
+
+def main():
+    """Prompt for a dataset type and size, then write that many captcha images to its directory."""
+    dataset_type = input("请输入数据集类型（1 - train / 2 - test / 3 - predict）：")
+    dataset_len = input("请输入数据集长度：")
+
+    paths = [captcha_settings.TRAIN_DATASET_PATH, captcha_settings.TEST_DATASET_PATH, captcha_settings.PREDICT_DATASET_PATH]
+
+    dataset_type = int(dataset_type)
+    count = int(dataset_len)
+    # Validate before indexing: paths[dataset_type - 1] would silently wrap around for 0 or
+    # negative input (0 -> predict) and raise a bare IndexError for values past the end.
+    if not 1 <= dataset_type <= len(paths):
+        raise SystemExit(f"Invalid dataset type {dataset_type}: expected a number from 1 to {len(paths)}")
+    path = paths[dataset_type - 1]
+    os.makedirs(path, exist_ok=True)
+    for i in trange(count):
+        text, image = gen_captcha_text_and_image()
+        # File name is "<zero-padded index>_<captcha text>.png".
+        image.save(os.path.join(path, f"{i:05d}_{text}.png"))
+
+
+if __name__ == "__main__":
+    main()
diff --git a/captcha_settings.py b/captcha_settings.py
new file mode 100644
index 0000000..5408a41
--- /dev/null
+++ b/captcha_settings.py
@@ -0,0 +1,20 @@
+"""Shared settings: captcha character set, image size and dataset directories."""
+import os
+
+# Characters that can appear in a captcha: the digits followed by the
+# upper-case letters (the same characters as string.digits + string.ascii_uppercase).
+NUMBER = list('0123456789')
+ALPHABET = list('ABCDEFGHIJKLMNOPQRSTUVWXYZ')
+
+ALL_CHAR_SET = NUMBER + ALPHABET
+ALL_CHAR_SET_STR = ''.join(ALL_CHAR_SET)
+ALL_CHAR_SET_LEN = len(ALL_CHAR_SET)
+MAX_CAPTCHA = 4
+
+# Image size
+IMAGE_HEIGHT = 60
+IMAGE_WIDTH = 160
+
+TRAIN_DATASET_PATH = os.path.join('dataset', 'train')
+TEST_DATASET_PATH = os.path.join('dataset', 'test')
+PREDICT_DATASET_PATH = os.path.join('dataset', 'predict')
\ No newline at end of file
diff --git a/cnn_net.py b/cnn_net.py
new file mode 100644
index 0000000..7931344
--- /dev/null
+++ b/cnn_net.py
@@ -0,0 +1,91 @@
+"""Convolutional network that maps a captcha image to per-character class scores."""
+import torch.nn as nn
+import captcha_settings
+
+
+def _conv_block(in_channels, out_channels):
+    """Return a 3x3 convolution (padding 1) + ReLU + 2x2 max-pool stage."""
+    return nn.Sequential(
+        nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1),
+        nn.ReLU(),
+        nn.MaxPool2d(kernel_size=2),
+    )
+
+
+class ConvNet(nn.Module):
+    """Four conv/pool stages followed by a two-layer fully connected head.
+
+    The input is a single-channel image of IMAGE_HEIGHT x IMAGE_WIDTH; the output has
+    MAX_CAPTCHA * ALL_CHAR_SET_LEN values per sample (one score per character class for
+    each captcha position).
+    """
+
+    def __init__(self):
+        super().__init__()
+        self.layer1 = _conv_block(1, 64)
+        self.layer2 = _conv_block(64, 128)
+        self.layer3 = _conv_block(128, 256)
+        self.layer4 = _conv_block(256, 512)
+        # Each of the four 2x2 max-pools halves height and width (rounding down), so the
+        # flattened feature size follows from the configured image size instead of being
+        # hard-coded (60x160 -> 3 * 10 * 512 = 15360).
+        flat_features = 512 * (captcha_settings.IMAGE_HEIGHT // 16) * (captcha_settings.IMAGE_WIDTH // 16)
+        self.layer5 = nn.Sequential(
+            nn.Flatten(),
+            nn.Linear(in_features=flat_features, out_features=4096),
+            nn.Dropout(0.5),
+            nn.ReLU(),
+            nn.Linear(
+                4096,
+                captcha_settings.MAX_CAPTCHA * captcha_settings.ALL_CHAR_SET_LEN,
+            ),
+        )
+
+    def forward(self, x):
+        """Run ``x`` through the four conv stages and the fully connected head."""
+        x = self.layer1(x)
+        x = self.layer2(x)
+        x = self.layer3(x)
+        x = self.layer4(x)
+        x = self.layer5(x)
+        return x
+
+
+# Alternative, smaller architecture kept commented out for reference.
+# class ConvNet(nn.Module):
+#     def __init__(self):
+#         super(ConvNet, self).__init__()
+#         self.layer1 = nn.Sequential(
+#             nn.Conv2d(1, 32, kernel_size=3, padding=1),
+#             nn.BatchNorm2d(32),
+#             nn.Dropout(0.5),  # drop 50% of the neuron
+#             nn.ReLU(),
+#             nn.MaxPool2d(2))
+#         self.layer2 = nn.Sequential(
+#             nn.Conv2d(32, 64, kernel_size=3, padding=1),
+#             nn.BatchNorm2d(64),
+#             nn.Dropout(0.5),  # drop 50% of the neuron
+#             nn.ReLU(),
+#             nn.MaxPool2d(2))
+#         self.layer3 = nn.Sequential(
+#             nn.Conv2d(64, 64, kernel_size=3, padding=1),
+#             nn.BatchNorm2d(64),
+#             nn.Dropout(0.5),  # drop 50% of the neuron
+#             nn.ReLU(),
+#             nn.MaxPool2d(2))
+#         self.fc = nn.Sequential(
+#             nn.Linear((captcha_settings.IMAGE_WIDTH//8)*(captcha_settings.IMAGE_HEIGHT//8)*64, 1024),
+#             nn.Dropout(0.5),  # drop 50% of the neuron
+#             nn.ReLU())
+#         self.rfc = nn.Sequential(
+#             nn.Linear(1024, captcha_settings.MAX_CAPTCHA*captcha_settings.ALL_CHAR_SET_LEN),
+#         )
+
+#     def forward(self, x):
+#         out = self.layer1(x)
+#         out = self.layer2(out)
+#         out = self.layer3(out)
+#         out = out.view(out.size(0), -1)
+#         out = self.fc(out)
+#         out = self.rfc(out)
+#         return out
\ No newline at end of file
diff --git a/dataset.py b/dataset.py
new file mode 100644
index 0000000..3c5920d
--- /dev/null
+++ b/dataset.py
@@ -0,0 +1,70 @@
+"""Dataset and DataLoader helpers for the generated captcha images."""
+import os
+from torch.utils.data import Dataset, DataLoader
+import torchvision.transforms as transforms
+from PIL import Image
+import one_hot_encoding as ohe
+from captcha_settings import TRAIN_DATASET_PATH, TEST_DATASET_PATH, PREDICT_DATASET_PATH
+
+
+class CaptchaDataset(Dataset):
+    """Images from one directory, labelled with the text embedded in each file name."""
+
+    def __init__(self, dir, transform=None):
+        # Remember the path of every entry in the directory.
+        self.train_images = []
+        for entry in os.listdir(dir):
+            self.train_images.append(os.path.join(dir, entry))
+        self.transform = transform
+
+    def __len__(self):
+        return len(self.train_images)
+
+    def __getitem__(self, idx):
+        """Return the (optionally transformed) image at ``idx`` and its one-hot label."""
+        path = self.train_images[idx]
+        picture = Image.open(path)
+        if self.transform is not None:
+            picture = self.transform(picture)
+        # The label is taken from the file name: keep what precedes the first dot,
+        # then what follows the last underscore of that.
+        file_name = path.split(os.path.sep)[-1]
+        captcha_text = file_name.split('.')[0].split('_')[-1]
+        return picture, ohe.encode(captcha_text)
+
+
+# Convert to a tensor first, then apply the grayscale transform.
+transform = transforms.Compose([
+    transforms.ToTensor(),
+    transforms.Grayscale(),
+])
+
+
+def _shuffled_loader(directory, batch_size):
+    """Wrap the images under ``directory`` in a shuffling DataLoader."""
+    return DataLoader(CaptchaDataset(directory, transform), batch_size=batch_size, shuffle=True)
+
+
+def get_train_loader(batch_size=60):
+    """Return a shuffling DataLoader over TRAIN_DATASET_PATH."""
+    return _shuffled_loader(TRAIN_DATASET_PATH, batch_size)
+
+
+def get_test_loader(batch_size=60):
+    """Return a shuffling DataLoader over TEST_DATASET_PATH."""
+    return _shuffled_loader(TEST_DATASET_PATH, batch_size)
+
+
+def get_predict_loader(batch_size=60):
+    """Return a shuffling DataLoader over PREDICT_DATASET_PATH."""
+    return _shuffled_loader(PREDICT_DATASET_PATH, batch_size)
+
+
+def main():
+    """Print the image and label batch shapes for every training batch."""
+    for image, label in get_train_loader():
+        print(image.shape, label.shape)
+
+
+if __name__ == '__main__':
+    main()
\ No newline at end of file
diff --git a/one_hot_encoding.py b/one_hot_encoding.py
new file mode 100644
index 0000000..c4ff79b
--- /dev/null
+++ b/one_hot_encoding.py
@@ -0,0 +1,63 @@
+"""One-hot encoding and decoding of captcha text."""
+import numpy as np
+import captcha_settings
+import torch
+
+
+def encode(label):
+    """Convert captcha text into a flat one-hot vector.
+
+    A zero tensor with MAX_CAPTCHA rows and one column per character of
+    ALL_CHAR_SET_STR is created; for each character of ``label`` the cell at that
+    character's index is set to 1, and the rows are then flattened.
+
+    Args:
+        label: Captcha text of at most MAX_CAPTCHA characters, all from ALL_CHAR_SET_STR.
+
+    Returns:
+        A 1-D float32 tensor of length MAX_CAPTCHA * len(ALL_CHAR_SET_STR)
+        (4 * 36 = 144 with the current settings).
+
+    Raises:
+        ValueError: If ``label`` contains a character outside ALL_CHAR_SET_STR.
+        IndexError: If ``label`` is longer than MAX_CAPTCHA.
+    """
+    cols = len(captcha_settings.ALL_CHAR_SET_STR)
+    rows = captcha_settings.MAX_CAPTCHA
+    # float32 rather than the builtin float (= float64): it matches the network output
+    # dtype, and float64 tensors cannot be moved to the "mps" device.
+    result = torch.zeros((rows, cols), dtype=torch.float32)
+    for i, char in enumerate(label):
+        j = captcha_settings.ALL_CHAR_SET_STR.index(char)
+        result[i, j] = 1.0
+    return result.view(-1)
+
+
+def decode(pred_result):
+    """Convert one-hot vectors or prediction scores back into text.
+
+    The input is reshaped to rows of len(ALL_CHAR_SET_STR) values; the index of the
+    largest value in each row selects the character from ALL_CHAR_SET_STR.
+
+    Args:
+        pred_result: Tensor whose element count is a multiple of len(ALL_CHAR_SET_STR).
+
+    Returns:
+        The decoded string, one character per row.
+    """
+    # reshape (unlike view) also accepts non-contiguous tensors.
+    rows = pred_result.reshape(-1, len(captcha_settings.ALL_CHAR_SET_STR))
+    index_list = torch.argmax(rows, dim=1).tolist()
+    return "".join(captcha_settings.ALL_CHAR_SET_STR[i] for i in index_list)
+
+
+def main():
+    """Round-trip a sample label through encode and decode and print both results."""
+    label = "ABCD"
+    one_hot_label = encode(label)
+    print(one_hot_label)
+    decoded_label = decode(one_hot_label)
+    print(decoded_label)
+
+
+if __name__ == '__main__':
+    main()
\ No newline at end of file
diff --git a/predict.py b/predict.py
new file mode 100644
index 0000000..f9e287e
--- /dev/null
+++ b/predict.py
@@ -0,0 +1,64 @@
+"""Predict a random sample of captcha images and show the results in a grid."""
+import torch
+from torchvision import transforms
+from PIL import Image
+import matplotlib.pyplot as plt
+from cnn_net import ConvNet
+import os
+import random
+import captcha_settings
+import one_hot_encoding
+
+device = "cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu"
+print(f"Using {device} device")
+
+
+def predict(model, file_path):
+    """Return the captcha text that ``model`` predicts for the image at ``file_path``."""
+    trans = transforms.Compose([
+        transforms.ToTensor(),
+        transforms.Grayscale()
+    ])
+    with torch.no_grad():
+        # Add a batch dimension: (1, 1, height, width).
+        X = trans(Image.open(file_path)).reshape(1, 1, captcha_settings.IMAGE_HEIGHT, captcha_settings.IMAGE_WIDTH)
+        X = X.to(device)
+        pred = model(X)
+        text = one_hot_encoding.decode(pred)
+        return text
+
+
+def main():
+    """Load ./model.pth, predict randomly chosen images from the predict set and plot them."""
+    model = ConvNet().to(device)
+    # map_location lets weights that were saved on one device load on whichever device is in use.
+    model.load_state_dict(torch.load("./model.pth", weights_only=True, map_location=device))
+    model.eval()
+
+    # randomly pick some images, at most one per grid cell
+    pickup_rect = [5, 6]
+    files = os.listdir(captcha_settings.PREDICT_DATASET_PATH)
+    # random.sample raises ValueError when asked for more items than are available.
+    pickup_count = min(pickup_rect[0] * pickup_rect[1], len(files))
+    images_picked = random.sample(files, pickup_count)
+
+    # show as a grid, with predicted text, correct or not
+    fig, axes = plt.subplots(nrows=pickup_rect[0], ncols=pickup_rect[1], figsize=(10, 8))
+    cells = axes.ravel()
+    for ax, image_name in zip(cells, images_picked):
+        real_text = image_name.split(".")[0].split("_")[-1]
+        # The images were listed from the predict set, so they must be read from there too.
+        file_path = os.path.join(captcha_settings.PREDICT_DATASET_PATH, image_name)
+        pred_text = predict(model, file_path)
+        correct = real_text == pred_text
+        ax.imshow(plt.imread(file_path))
+        ax.set_title(f"{pred_text}, {'yes' if correct else 'no'}")
+        ax.axis('off')
+    # Blank out grid cells that received no image.
+    for ax in cells[pickup_count:]:
+        ax.axis('off')
+    plt.show()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..258bb24
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,30 @@
+captcha==0.6.0
+colorama==0.4.6
+contourpy==1.3.0
+cycler==0.12.1
+filelock==3.13.1
+fonttools==4.54.1
+fsspec==2024.2.0
+Jinja2==3.1.3
+joblib==1.4.2
+kiwisolver==1.4.7
+MarkupSafe==2.1.5
+matplotlib==3.9.2
+mpmath==1.3.0
+networkx==3.2.1
+numpy==1.26.3
+packaging==24.1
+pillow==10.2.0
+pyparsing==3.1.4
+python-dateutil==2.9.0.post0
+scikit-learn==1.5.2
+scipy==1.14.1
+setuptools==70.0.0
+six==1.16.0
+sympy==1.12
+threadpoolctl==3.5.0
+torch==2.4.1+cu124
+torchaudio==2.4.1+cu124
+torchvision==0.19.1+cu124
+tqdm==4.66.5
+typing_extensions==4.9.0
diff --git a/test.py b/test.py
new file mode 100644
index 0000000..ace9bc6
--- /dev/null
+++ b/test.py
@@ -0,0 +1,57 @@
+"""Measure whole-captcha accuracy of the saved model on the test set."""
+import os
+import torch
+from PIL import Image
+from cnn_net import ConvNet
+import one_hot_encoding
+from torchvision import transforms
+import captcha_settings
+from tqdm import tqdm
+
+device = "cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu"
+print(f"Using {device} device")
+
+
+def predict(model, file_path):
+    """Return the captcha text that ``model`` predicts for the image at ``file_path``."""
+    trans = transforms.Compose([
+        transforms.ToTensor(),
+        transforms.Grayscale()
+    ])
+    with torch.no_grad():
+        # Add a batch dimension: (1, 1, height, width).
+        X = trans(Image.open(file_path)).reshape(1, 1, captcha_settings.IMAGE_HEIGHT, captcha_settings.IMAGE_WIDTH)
+        X = X.to(device)
+        pred = model(X)
+        text = one_hot_encoding.decode(pred)
+        return text
+
+
+def main():
+    """Load ./model.pth and print the share of test images whose text is predicted exactly."""
+    model = ConvNet().to(device)
+    # map_location lets weights that were saved on one device load on whichever device is in use.
+    model.load_state_dict(torch.load("./model.pth", weights_only=True, map_location=device))
+    model.eval()
+
+    # List the directory once so the total and the loop see the same files.
+    filenames = os.listdir(captcha_settings.TEST_DATASET_PATH)
+    total = len(filenames)
+    if not filenames:
+        # Nothing to evaluate; the accuracy computation below would divide by zero.
+        print(f"No files found in {captcha_settings.TEST_DATASET_PATH}")
+        return
+
+    correct = 0
+    for filename in tqdm(filenames):
+        file_path = os.path.join(captcha_settings.TEST_DATASET_PATH, filename)
+        real_captcha = filename.split('.')[0].split('_')[-1]
+        pred_captcha = predict(model, file_path)
+        if pred_captcha == real_captcha:
+            correct += 1
+    accuracy = f"Test {total} files, accuracy: {correct / total * 100:.2f}%"
+    print(accuracy)
+
+
+if __name__ == '__main__':
+    main()
\ No newline at end of file
diff --git a/train.py b/train.py
new file mode 100644
index 0000000..968bfa9
--- /dev/null
+++ b/train.py
@@ -0,0 +1,48 @@
+"""Train the captcha recognition network and save its weights to ./model.pth."""
+import torch
+import torch.nn as nn
+
+from tqdm import tqdm
+
+from cnn_net import ConvNet
+import dataset
+
+num_epochs = 10
+learning_rate = 0.001
+
+device = "cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu"
+
+
+def main():
+    """Train ConvNet for ``num_epochs`` epochs, saving the weights after every epoch."""
+    net = ConvNet().to(device)
+    net.train()
+
+    loss_fn = nn.MultiLabelSoftMarginLoss()
+    adam = torch.optim.Adam(net.parameters(), lr=learning_rate)
+
+    batches = dataset.get_train_loader()
+
+    for epoch_number in range(1, num_epochs + 1):
+        print("Epoch:", epoch_number)
+        progress = tqdm(enumerate(batches), total=len(batches))
+        for _, (images, labels) in progress:
+            images = images.to(device)
+            labels = labels.to(device)
+
+            loss = loss_fn(net(images), labels)
+
+            # Clear old gradients, back-propagate, then update the weights.
+            adam.zero_grad()
+            loss.backward()
+            adam.step()
+
+            progress.set_description(f"loss: {loss.item():.4f}")
+
+        # Report the last batch's loss and overwrite the saved weights for this epoch.
+        print("loss:", loss.item(), '\n')
+        torch.save(net.state_dict(), "./model.pth")
+
+
+if __name__ == "__main__":
+    main()