commit 05a8338c7278e0fcec2a664b2f822534e344772f Author: TaurusXin Date: Mon Sep 30 11:05:35 2024 +0800 init repo diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..90908ea --- /dev/null +++ b/.gitignore @@ -0,0 +1,9 @@ +# project files +dataset/ + +*.pth + +# python files +__pycache__/ + +.venv \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..d8fbbe8 --- /dev/null +++ b/README.md @@ -0,0 +1,62 @@ +# Captcha Recognition + +基于深度神经网络(DNN)的验证码识别 + +## 使用方法 + +1. 克隆项目到本地 + +```shell +git clone https://git.taurusxin.com/taurusxin/captcha.git +cd captcha +``` + +2. 创建虚拟环境并安装依赖 + +```shell +python -m venv .venv + +# Windows +.venv\Scripts\Activate.ps1 + +# Linux/MacOS +source .venv/bin/activate + +# 先安装 PyTorch GPU 版本,cuda 12.4 +pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu124 + +# 然后再安装其他依赖 +pip install -r requirements.txt +``` + +3. 根据提示生成数据集,生成3次数据集,分别用于训练,用于测试,用于验证。 + +建议的数据集长度如下: + +| 数据集 | 长度 | +| -------- | ------- | +| Train | 50000 | +| Test | 1000 | +| Predict | 30 | + +```shell +python captcha_gen.py +``` + +4. 训练模型 + +```shell +python train.py +``` + +5. 测试模型 + +```shell +python test.py +``` + +6. 
def random_captcha_text(char_set=captcha_settings.NUMBER + captcha_settings.ALPHABET, captcha_size=4):
    """Return a random captcha string.

    Args:
        char_set: Sequence of candidate characters. Defaults to the digits
            plus upper-case letters declared in ``captcha_settings``.
        captcha_size: Number of characters to draw (with replacement).

    Returns:
        A string made of ``captcha_size`` characters picked from ``char_set``.
    """
    return "".join(random.choice(char_set) for _ in range(captcha_size))


def gen_captcha_text_and_image(generator=None):
    """Create one random captcha text and render it as an image.

    Args:
        generator: Optional ``ImageCaptcha`` instance to reuse. When omitted,
            a new one is created for this call (the previous behaviour).

    Returns:
        A ``(text, image)`` tuple: the captcha string and the PIL image
        opened from the output of ``generator.generate``.
    """
    if generator is None:
        generator = ImageCaptcha()
    captcha_text = random_captcha_text()
    captcha_image = Image.open(generator.generate(captcha_text))
    return captcha_text, captcha_image


def main():
    """Ask for a dataset type and size, then write that many captcha PNGs.

    Files are named ``<5-digit index>_<captcha text>.png`` and are written to
    the directory that ``captcha_settings`` declares for the chosen dataset.

    Raises:
        ValueError: If either answer is not an integer.
        SystemExit: If the dataset type is not one of the offered choices.
    """
    dataset_type = input("请输入数据集类型(1 - train / 2 - test / 3 - predict):")
    dataset_len = input("请输入数据集长度:")

    paths = [
        captcha_settings.TRAIN_DATASET_PATH,
        captcha_settings.TEST_DATASET_PATH,
        captcha_settings.PREDICT_DATASET_PATH,
    ]

    dataset_type = int(dataset_type)
    count = int(dataset_len)

    # Without this check, 0 or a negative number would wrap around through
    # negative list indexing and silently write into another dataset's folder.
    if not 1 <= dataset_type <= len(paths):
        raise SystemExit("数据集类型必须是 1、2 或 3")
    path = paths[dataset_type - 1]

    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(path, exist_ok=True)

    # Reuse one generator for the whole run instead of building one per image.
    generator = ImageCaptcha()
    for i in trange(count):
        text, image = gen_captcha_text_and_image(generator)
        filename = f"{str(i).zfill(5)}_{text}.png"
        image.save(os.path.join(path, filename))


if __name__ == "__main__":
    main()
class ConvNet(nn.Module):
    """CNN that maps a captcha image to per-position character scores.

    Four ``Conv2d -> ReLU -> MaxPool2d`` stages (64, 128, 256, 512 channels)
    are followed by a flatten and a two-layer fully connected head. The first
    convolution takes a single input channel, and the head outputs
    ``MAX_CAPTCHA * ALL_CHAR_SET_LEN`` values per sample.

    Attribute names (``layer1`` .. ``layer5``) and the order of modules inside
    each ``nn.Sequential`` are kept stable so existing ``state_dict``
    checkpoints keep loading.
    """

    @staticmethod
    def _conv_stage(in_channels, out_channels):
        """Build one 3x3 convolution -> ReLU -> 2x2 max-pool stage."""
        return nn.Sequential(
            nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2),
        )

    def __init__(self):
        super().__init__()
        self.layer1 = self._conv_stage(1, 64)
        self.layer2 = self._conv_stage(64, 128)
        self.layer3 = self._conv_stage(128, 256)
        self.layer4 = self._conv_stage(256, 512)

        # The padded 3x3 convolutions keep the spatial size; each of the four
        # 2x2 poolings halves height and width (rounding down). The flattened
        # feature count is therefore 512 * (H // 16) * (W // 16), which is
        # 512 * 3 * 10 = 15360 for the configured 60x160 images. Deriving it
        # from the settings keeps the head in sync if the image size changes.
        flat_features = (
            512
            * (captcha_settings.IMAGE_HEIGHT // 16)
            * (captcha_settings.IMAGE_WIDTH // 16)
        )
        self.layer5 = nn.Sequential(
            nn.Flatten(),
            nn.Linear(in_features=flat_features, out_features=4096),
            nn.Dropout(0.5),
            nn.ReLU(),
            nn.Linear(
                4096,
                captcha_settings.MAX_CAPTCHA * captcha_settings.ALL_CHAR_SET_LEN,
            ),
        )

    def forward(self, x):
        """Run the batch ``x`` through all five stages and return the scores."""
        for stage in (self.layer1, self.layer2, self.layer3, self.layer4, self.layer5):
            x = stage(x)
        return x
class CaptchaDataset(Dataset):
    """Dataset of captcha images whose file names carry the label.

    The label is the part of the file name after the last underscore, with
    the extension removed (e.g. ``00042_AB3D.png`` -> ``AB3D``).
    """

    # Extensions treated as images; every other directory entry is ignored so
    # stray files (hidden OS files, notes, sub-directories) cannot crash
    # ``__getitem__``.
    _IMAGE_EXTENSIONS = (".png", ".jpg", ".jpeg", ".bmp", ".gif")

    def __init__(self, dir, transform=None):
        """Collect the image files found directly inside ``dir``.

        Args:
            dir: Directory that holds the captcha images.
            transform: Optional callable applied to each opened image.
        """
        # Sorted so that index -> file is deterministic across platforms;
        # os.listdir() returns entries in arbitrary order.
        self.train_images = sorted(
            os.path.join(dir, name)
            for name in os.listdir(dir)
            if name.lower().endswith(self._IMAGE_EXTENSIONS)
            and os.path.isfile(os.path.join(dir, name))
        )
        self.transform = transform

    def __len__(self):
        """Return the number of image files found."""
        return len(self.train_images)

    def __getitem__(self, idx):
        """Return ``(image, label)`` for the file at position ``idx``.

        The image is opened with PIL and passed through ``transform`` when one
        was given; any colour conversion is left to that transform. The label
        is the one-hot encoding produced by ``ohe.encode``.
        """
        image_path = self.train_images[idx]
        image = Image.open(image_path)
        if self.transform is not None:
            image = self.transform(image)
        stem = os.path.splitext(os.path.basename(image_path))[0]
        label = ohe.encode(stem.split("_")[-1])
        return image, label
def encode(label, char_set=None, max_len=None):
    """Convert a captcha string into a flattened one-hot tensor.

    A ``max_len x len(char_set)`` tensor of zeros is created, and for every
    character of ``label`` the cell at (position, index of the character in
    ``char_set``) is set to 1. The tensor is returned flattened to one
    dimension; with the project defaults that is 4 * 36 = 144 values.

    Args:
        label: Captcha text to encode. Labels shorter than ``max_len`` leave
            the remaining rows all-zero.
        char_set: Ordered characters (string or list). Defaults to
            ``captcha_settings.ALL_CHAR_SET_STR``.
        max_len: Number of character positions. Defaults to
            ``captcha_settings.MAX_CAPTCHA``.

    Returns:
        A 1-D ``torch.float64`` tensor of length ``max_len * len(char_set)``.

    Raises:
        IndexError: If ``label`` has more than ``max_len`` characters.
        ValueError: If ``label`` contains a character missing from ``char_set``.
    """
    if char_set is None:
        char_set = captcha_settings.ALL_CHAR_SET_STR
    if max_len is None:
        max_len = captcha_settings.MAX_CAPTCHA

    if len(label) > max_len:
        raise IndexError(
            f"label {label!r} has {len(label)} characters, at most {max_len} are supported"
        )

    # Passing the Python ``float`` type as dtype means float64 in torch; the
    # explicit name keeps the same dtype.
    result = torch.zeros((max_len, len(char_set)), dtype=torch.float64)
    for row, char in enumerate(label):
        try:
            col = char_set.index(char)
        except ValueError:
            raise ValueError(
                f"character {char!r} at position {row} is not in the captcha character set"
            ) from None
        result[row, col] = 1.0
    return result.flatten()


def decode(pred_result, char_set=None):
    """Convert one-hot values or model scores back into captcha text.

    The input is reshaped to rows of ``len(char_set)`` values, and for each
    row the character at the position of the largest value is taken.

    Args:
        pred_result: Tensor whose element count is a multiple of
            ``len(char_set)``, e.g. the output of ``encode``.
        char_set: Ordered characters (string or list). Defaults to
            ``captcha_settings.ALL_CHAR_SET_STR``.

    Returns:
        The decoded string, one character per row.
    """
    if char_set is None:
        char_set = captcha_settings.ALL_CHAR_SET_STR

    # reshape (unlike view) also accepts non-contiguous tensors.
    scores = pred_result.reshape(-1, len(char_set))
    indices = torch.argmax(scores, dim=1).tolist()
    return "".join(char_set[i] for i in indices)
def main():
    """Show model predictions for a random sample of the predict dataset.

    Loads the weights from ``./model.pth``, picks up to 30 random files from
    ``captcha_settings.PREDICT_DATASET_PATH`` and displays them in a 5x6 grid.
    Each image is titled with the predicted text and ``yes``/``no``, depending
    on whether it matches the label embedded in the file name.
    """
    model = ConvNet().to(device)
    # map_location lets a checkpoint saved on one device (e.g. CUDA) load on
    # whatever device was selected for this run.
    model.load_state_dict(
        torch.load("./model.pth", weights_only=True, map_location=device)
    )
    model.eval()

    grid_rows, grid_cols = 5, 6
    predict_dir = captcha_settings.PREDICT_DATASET_PATH
    files = os.listdir(predict_dir)
    # random.sample raises ValueError when asked for more items than exist,
    # so cap the sample size at the number of available files.
    pickup_count = min(grid_rows * grid_cols, len(files))
    images_picked = random.sample(files, pickup_count)

    _, axes = plt.subplots(nrows=grid_rows, ncols=grid_cols, figsize=(10, 8))
    cells = list(axes.flat)
    for cell, image_name in zip(cells, images_picked):
        real_text = image_name.split(".")[0].split("_")[-1]
        # Read from the directory the names were listed from. The previous
        # code joined them with TEST_DATASET_PATH, which pointed at missing
        # or unrelated files.
        file_path = os.path.join(predict_dir, image_name)
        pred_text = predict(model, file_path)
        correct = real_text == pred_text
        cell.imshow(plt.imread(file_path))
        cell.set_title(f"{pred_text}, {'yes' if correct else 'no'}")
    # Hide the axes everywhere, including cells left empty by a short sample.
    for cell in cells:
        cell.axis('off')
    plt.show()
def main():
    """Evaluate ``./model.pth`` on the test dataset and print the accuracy.

    Every file in ``captcha_settings.TEST_DATASET_PATH`` is run through
    ``predict``. A file counts as correct when the predicted text equals the
    label embedded in its name (the part after the last underscore).
    """
    model = ConvNet().to(device)
    # map_location lets a checkpoint saved on one device (e.g. CUDA) load on
    # whatever device was selected for this run.
    model.load_state_dict(
        torch.load("./model.pth", weights_only=True, map_location=device)
    )
    model.eval()

    test_dir = captcha_settings.TEST_DATASET_PATH
    # List the directory once so the total and the loop see the same files.
    filenames = os.listdir(test_dir)
    total = len(filenames)
    if total == 0:
        # Avoid dividing by zero when the test set has not been generated yet.
        print(f"No files found in {test_dir}; nothing to test.")
        return

    correct = 0
    for filename in tqdm(filenames):
        file_path = os.path.join(test_dir, filename)
        real_captcha = filename.split('.')[0].split('_')[-1]
        if predict(model, file_path) == real_captcha:
            correct += 1
    accuracy = f"Test {total} files, accuracy: {correct / total * 100:.2f}%"
    print(accuracy)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate) + + # Train the model + train_dataloader = dataset.get_train_loader() + + for epoch in range(num_epochs): + print("Epoch:", epoch+1) + pbar = tqdm(enumerate(train_dataloader), total=len(train_dataloader)) + for i, (images, labels) in pbar: + images, labels = images.to(device), labels.to(device) + + predict_labels = model(images) + + loss = criterion(predict_labels, labels) + + optimizer.zero_grad() + loss.backward() + optimizer.step() + + pbar.set_description("loss: %.4f" % loss.item()) + + print("loss:", loss.item(), '\n') + torch.save(model.state_dict(), "./model.pth") + +if __name__ == "__main__": + main()