首頁  >  文章  >  後端開發  >  創建聊天機器人 - JO PARIS 4

創建聊天機器人 - JO PARIS 4

PHPz
PHPz原創
2024-08-27 06:04:32633瀏覽

Create chat bot - JO PARIS 4

在本文中,我將展示如何使用 Tensorflow 建立一個簡單的聊天機器人。

對於數據,我使用 PARIS JO JO 2024 的 kaggle 資料集來取得訓練階段的句子。

您可以在我的github中取得完成程式碼:https://github.com/victordalet/Kaggle_analysis/tree/feat/paris_2024_olympics


I - 預設聊天機器人資料集

聊天機器人上的張量流資料集如下所示。
我們可以找到標籤、模式和各種回應。
我們的目標是從 JO 投注資料集中添加不同的序列,並將它們添加到這樣的檔案中。

{
  "intents": [
    {
      "tag": "google",
      "patterns": [
        "google",
        "search",
        "internet"
      ],
      "responses": [
        "Redirecting to Google..."
      ]
    },

II - 資料處理

我讀取了預設 json 和 JO 的 csv 中的聊天機器人資料集,並將其分割並處理以在 json 中添加句子

import json


class CreateDataset:
    def __init__(self):
        self.json_path = 'data.json'
        self.csv_path = '../paris-2024-faq.csv'
        with open(self.json_path) as file:
            self.dataset = json.load(file)
        f = open(self.csv_path, 'r')
        dataset_split = f.read().split(";")
        question = False
        for data in dataset_split:
            if question:
                question = False
                self.dataset["intents"][-1]["responses"].append(data)

            if "?" in data:
                question = True
                self.dataset["intents"].append({
                    "tag": "",
                    "patterns": [
                        data
                    ],
                    "responses": [
                    ]
                })
        with open(self.json_path, 'w') as f:
            json.dump(self.dataset, f)

三、培訓

出於訓練目的,我編輯了一個張量流範例。
如果您使用我的程式碼來運行它,請在第一個參數中新增您想要的紀元數。
建立模型所在的保存目錄,然後新增 github 中的classes.pkl 和words.pkl 文件,如本文開頭所示。

import random
import json
import pickle
import numpy as np
import sys

import nltk
from nltk.stem import WordNetLemmatizer

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import SGD


class Train:
    words: list
    classes: list
    documents: list
    ignore_letters: list
    training: list
    output_empty: list
    train_x: list
    train_y: list
    model: Sequential
    epochs: int

    def __init__(self):
        self.lemmatizer = WordNetLemmatizer()
        self.intents = json.loads(open('data.json').read())
        self.words = []
        self.classes = []
        self.documents = []
        self.training = []
        self.ignore_letters = ['?', '!']
        self.epochs = int(sys.argv[1])

    def run(self):
        self.download_nltk_data()
        self.load_training_data()
        self.prepare_training_data()
        self.build_neural_network()
        self.train()

    @staticmethod
    def download_nltk_data():
        nltk.download('punkt')
        nltk.download('wordnet')

    def load_training_data(self):
        for intent in self.intents['intents']:
            for pattern in intent['patterns']:
                word_list = nltk.word_tokenize(pattern)
                self.words.extend(word_list)
                self.documents.append((word_list, intent['tag']))
                if intent['tag'] not in self.classes:
                    self.classes.append(intent['tag'])

    def prepare_training_data(self):
        self.words = [self.lemmatizer.lemmatize(word)
                      for word in self.words
                      if word not in self.ignore_letters]

        self.words = sorted(set(self.words))
        self.classes = sorted(set(self.classes))
        pickle.dump(self.words, open('saves/words.pkl', 'wb'))
        pickle.dump(self.classes, open('saves/classes.pkl', 'wb'))

        self.output_empty = [0] * len(self.classes)
        for document in self.documents:
            bag = []
            word_patterns = document[0]
            word_patterns = [self.lemmatizer.lemmatize(word.lower())
                             for word in word_patterns]
            for word in self.words:
                bag.append(1) if word in word_patterns else bag.append(0)

            output_row = list(self.output_empty)
            output_row[self.classes.index(document[1])] = 1
            self.training.append([bag, output_row])

        random.shuffle(self.training)
        self.training = np.array(self.training)

        self.train_x = list(self.training[:, 0])
        self.train_y = list(self.training[:, 1])

    def build_neural_network(self):
        self.model = Sequential()
        self.model.add(Dense(128, input_shape=(len(self.train_x[0]),),
                             activation='relu'))
        self.model.add(Dropout(0.5))
        self.model.add(Dense(64, activation='relu'))
        self.model.add(Dropout(0.5))
        self.model.add(Dense(len(self.train_y[0]), activation='softmax'))

        sgd = SGD(lr=0.01, momentum=0.9, nesterov=True)
        self.model.compile(loss='categorical_crossentropy',
                           optimizer=sgd,
                           metrics=['accuracy'])

    def train(self):
        self.model.fit(np.array(self.train_x),
                       np.array(self.train_y),
                       epochs=self.epochs,
                       batch_size=5,
                       verbose=1)
        self.model.save('saves/chatbot_model.model')


if __name__ == "__main__":
    Train().run()

IV - 測試

我建立了一個 ChatBot 類,其中包含接受隨機訊息的測試方法。
您可以使用 get_response 方法將此聊天機器人新增至您的應用程式中,例如,我在 Flask api 的一個專案中呼叫它,以便將我的聊天機器人新增至網站。

import random
import json
import pickle
import numpy as np

import nltk
from nltk.stem import WordNetLemmatizer
from tensorflow.keras.models import load_model


class ChatBot:
    lemmatizer: WordNetLemmatizer
    intents: dict
    words: list
    classes: list
    model: load_model
    ERROR_THRESHOLD = 0.25

    def __init__(self):
        self.download_nltk_data()
        self.lemmatizer = WordNetLemmatizer()
        self.intents = json.loads(open('data.json').read())
        self.words = pickle.load(open('saves/words.pkl', 'rb'))
        self.classes = pickle.load(open('saves/classes.pkl', 'rb'))
        self.model = load_model('saves/chatbot_model.model')

    @staticmethod
    def download_nltk_data():
        nltk.download('punkt')
        nltk.download('wordnet')

    def clean_up_sentence(self, sentence):
        sentence_words = nltk.word_tokenize(sentence)
        sentence_words = [self.lemmatizer.lemmatize(word)
                          for word in sentence_words]
        return sentence_words

    def bag_of_words(self, sentence):
        sentence_words = self.clean_up_sentence(sentence)
        bag = [0] * len(self.words)
        for w in sentence_words:
            for i, word in enumerate(self.words):
                if word == w:
                    bag[i] = 1
        return np.array(bag)

    def predict_class(self, sentence):
        bow = self.bag_of_words(sentence)
        res = self.model.predict(np.array([bow]))[0]
        results = [[i, r]
                   for i, r in enumerate(res)
                   if r > self.ERROR_THRESHOLD]
        results.sort(key=lambda x: x[1], reverse=True)
        return_list = []
        for r in results:
            return_list.append({'intent': self.classes[r[0]],
                                'probability': str(r[1])})
        return return_list

    def get_response(self, intents_list):
        intents_json = self.intents
        tag = intents_list[0]['intent']
        list_of_intents = intents_json['intents']
        for i in list_of_intents:
            if i['tag'] == tag:
                result = random.choice(i['responses'])
                break
        return result

    def test(self):
        while True:
            message = input("")
            ints = self.predict_class(message)
            res = self.get_response(ints)
            print(res)

以上是創建聊天機器人 - JO PARIS 4的詳細內容。更多資訊請關注PHP中文網其他相關文章!

陳述:
本文內容由網友自願投稿,版權歸原作者所有。本站不承擔相應的法律責任。如發現涉嫌抄襲或侵權的內容,請聯絡admin@php.cn