from bertopic import BERTopic
import pandas as pd
from sentence_transformers import SentenceTransformer
import sklearn.manifold
import umap
import numpy as np
import random
from nltk.corpus import stopwords
random.seed(42)
from bokeh.io import output_file, show
from bokeh.models import ColumnDataSource, HoverTool, LinearColorMapper
from bokeh.palettes import plasma, d3, Turbo256
from bokeh.plotting import figure
from bokeh.transform import transform
import bokeh.io
bokeh.io.output_notebook()
import bokeh.plotting as bpl
import bokeh.models as bmo
bpl.output_notebook()
Load the data
test = pd.read_csv('test.csv')
train = pd.read_csv('train.csv')
# def preprocess_tweet_data(data, name):
#     # Lower-case the words in the sentences
#     data[name] = data[name].str.lower()
#     # Remove hashtags from the text
#     data[name] = data[name].apply(lambda x: re.sub(r'\B#\S+', '', x))
#     # Remove links from the text
#     data[name] = data[name].apply(lambda x: re.sub(r"http\S+", "", x))
#     # Remove special characters from the text
#     data[name] = data[name].apply(lambda x: ' '.join(re.findall(r'\w+', x)))
#     # Collapse multiple spaces into a single space
#     data[name] = data[name].apply(lambda x: re.sub(r'\s+', ' ', x, flags=re.I))
#     # Remove single characters in the text
#     data[name] = data[name].apply(lambda x: re.sub(r'\s+[a-zA-Z]\s+', '', x))
#     # Remove Twitter handles
#     data[name] = data[name].apply(lambda x: re.sub(r'@[^\s]+', '', x))
import re
import nltk

def preprocess(data):
    excerpt_processed = []
    for e in data['excerpt']:
        # keep alphabetic characters only
        e = re.sub("[^a-zA-Z]", " ", e)
        e = re.sub(r'\s+', ' ', e, flags=re.I)

        # # convert to lower case
        # e = e.lower()

        # tokenize words
        e = nltk.word_tokenize(e)
        # remove stopwords
        e = [word for word in e if not word.lower() in set(stopwords.words("english"))]
        # lemmatization
        lemma = nltk.WordNetLemmatizer()
        e = [lemma.lemmatize(word) for word in e]
        e = " ".join(e)

        excerpt_processed.append(e)

    return excerpt_processed
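The later cells encode and model a texts variable that never appears in this excerpt; presumably it holds the processed excerpts. A minimal sketch of that step, assuming the train dataframe and the preprocess helper above (the assignment itself is an assumption, not the author's exact code):

texts = preprocess(train)                    # processed excerpts used for embeddings and topic models below
train['excerpt_processed'] = texts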
Compute document embeddings with Sentence-BERT
model = SentenceTransformer('stsb-distilbert-base')
embeddings = model.encode(texts)
Dimensionality reduction: t-SNE and UMAP
t-SNE preserves the local structure of the data.
UMAP preserves local structure as well as most of the global structure.
UMAP is also much faster than t-SNE, which matters as the data grows larger and higher-dimensional.
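Only the t-SNE projection is shown below; a comparable UMAP projection of the same embeddings could be produced like this (the n_neighbors and metric values are illustrative assumptions, using the umap-learn API):

# UMAP projection of the sentence embeddings (parameters are illustrative)
umap_out = umap.UMAP(n_neighbors=15, n_components=2, metric='cosine',
                     random_state=42).fit_transform(embeddings)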
color_mapper = LinearColorMapper(palette='Plasma256', low=min(targets), high=max(targets))
out = sklearn.manifold.TSNE(n_components=2).fit_transform(embeddings)
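# The figure below references hover, source, MARKERS, and SETS, none of which appear in this
# excerpt. A minimal, assumed setup (the targets and dset arrays are illustrative assumptions):
source = ColumnDataSource(data=dict(
    x=out[:, 0],
    y=out[:, 1],
    targets=targets,              # assumed: one numeric target per document
    dset=dset,                    # assumed: 'train' or 'test' label per document
))
hover = HoverTool(tooltips=[("target", "@targets"), ("set", "@dset")])
SETS = ['train', 'test']          # factor values taken by dset
MARKERS = ['circle', 'triangle']  # one marker per factor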
p = figure(plot_width=800, plot_height=800, tools=[hover], title="First Look at the Data")
p.scatter('x', 'y', size=10, source=source, legend='dset',
          color={'field': 'targets', 'transform': color_mapper},
          marker=bokeh.transform.factor_mark('dset', MARKERS, SETS))
bpl.show(p)
Exploring the data with K-means
from sklearn.decomposition import PCA
from sklearn.cluster import MiniBatchKMeans
from sklearn.metrics import silhouette_samples, silhouette_score
import matplotlib.pyplot as plt
import matplotlib.cm as cm
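The next snippet colours a random subset of points by cluster label, but pca, labels, and max_label are never created in the excerpt. A minimal sketch of how they might be produced from the sentence embeddings, with an illustrative choice of k (everything here is an assumption, not the author's exact setup):

# project the embeddings to 2-D with PCA and cluster them with MiniBatchKMeans
pca = PCA(n_components=2).fit_transform(embeddings)

k = 10  # illustrative number of clusters
kmeans = MiniBatchKMeans(n_clusters=k, random_state=42)
labels = kmeans.fit_predict(embeddings)
max_label = labels.max()

# silhouette score gives a rough sense of how well separated the clusters are
print('silhouette:', silhouette_score(embeddings, labels))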
# sample 320 points and colour them by their cluster label
idx = np.random.choice(range(pca.shape[0]), size=320, replace=False)
label_subset = labels[idx]
label_subset = [cm.hsv(i / max_label) for i in label_subset]
model = BERTopic(language="english", min_topic_size=20)
topics, probs = model.fit_transform(texts)
topic_words = ['-1: outlier']
for i in range(len(set(topics)) - 1):
    tpc = model.get_topic(i)[:8]
    words = [x[0] for x in tpc]
    tw = ' '.join([str(i) + ':'] + words)
    topic_words.append(tw)
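BERTopic also ships built-in helpers for inspecting a fitted model; assuming the model above, something like the following shows a per-topic summary and an interactive inter-topic map:

# topic id, size, and representative words per topic
topic_info = model.get_topic_info()
print(topic_info.head())

# interactive inter-topic distance map
model.visualize_topics()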
import os
os.environ['MALLET_HOME'] = '/content/mallet-2.0.8'
mallet_path = '/content/mallet-2.0.8/bin/mallet'  # you should NOT need to change this
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models.wrappers import LdaMallet
from gensim.models.coherencemodel import CoherenceModel
from gensim import similarities
import os.path
import re
import glob
import nltk
nltk.download('stopwords')
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
def preprocess_data(doc_set, extra_stopwords={}):
    # adapted from https://www.datacamp.com/community/tutorials/discovering-hidden-topics-python
    # replace all newlines or multiple sequences of spaces with a standard space
    doc_set = [re.sub(r'\s+', ' ', doc) for doc in doc_set]
    # initialize regex tokenizer
    tokenizer = RegexpTokenizer(r'\w+')
    # create English stop words list
    en_stop = set(stopwords.words('english'))
    # add any extra stopwords
    if (len(extra_stopwords) > 0):
        en_stop = en_stop.union(extra_stopwords)

    # list for tokenized documents in loop
    texts = []
    # loop through document list
    for i in doc_set:
        # clean and tokenize document string
        raw = i.lower()
        tokens = tokenizer.tokenize(raw)
        # remove stop words from tokens
        stopped_tokens = [i for i in tokens if not i in en_stop]
        # add tokens to list
        texts.append(stopped_tokens)
    return texts
def prepare_corpus(doc_clean):
    # adapted from https://www.datacamp.com/community/tutorials/discovering-hidden-topics-python
    # Create the term dictionary of our corpus, where every unique term is assigned an index.
    dictionary = corpora.Dictionary(doc_clean)
    dictionary.filter_extremes(no_below=5, no_above=0.5)
    # Convert the list of documents (corpus) into a Document-Term Matrix using the dictionary prepared above.
    doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_clean]
    # generate LDA model
    return dictionary, doc_term_matrix
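The cells that follow use an ldamallet model and a number_of_topics value that are not constructed anywhere in the excerpt. A minimal sketch of that step, assuming gensim 3.x (where the Mallet wrapper still exists) and the helpers defined above (the topic count is an illustrative choice):

# tokenize the corpus and build the dictionary / document-term matrix
doc_clean = preprocess_data(train['excerpt'])
dictionary, doc_term_matrix = prepare_corpus(doc_clean)

number_of_topics = 10  # illustrative choice
ldamallet = LdaMallet(mallet_path, corpus=doc_term_matrix,
                      num_topics=number_of_topics, id2word=dictionary)

# coherence is a common way to compare different numbers of topics
coherence = CoherenceModel(model=ldamallet, texts=doc_clean,
                           dictionary=dictionary, coherence='c_v').get_coherence()
print('coherence:', coherence)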
# topic_words = ldamallet.show_topics(num_topics=number_of_topics, num_words=5)
# topic_words = [x[1] for x in topic_words]
topic_words = []
for i in range(number_of_topics):
    tpc = ldamallet.show_topic(i, topn=7, num_words=None)
    words = [x[0] for x in tpc]
    tw = ' '.join([str(i) + ':'] + words)
    topic_words.append(tw)
topic_words
# show result
topics_docs = list()
for m in ldamallet[doc_term_matrix[:1000]]:
    topics_docs.append(m)
x = np.array(topics_docs[:1000])
y = np.delete(x, 0, axis=2)
y = y.squeeze()
best_topics = np.argmax(y, axis=1)  # each document gets a distribution over topics; keep the most likely one
topics = list(best_topics)
topics = [topic_words[x] for x in topics]
import numpy as np
import pandas as pd
import jieba
import umap
import hdbscan
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
import matplotlib.pyplot as plt
c-TF-IDF

def c_tf_idf(documents, m, ngram_range=(1, 1)):
    my_stopwords = [i.strip() for i in open('stop_words_zh.txt', encoding='utf-8').readlines()]
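    # --- The function body is cut off at this point. What follows is a sketch of the standard
    # --- c-TF-IDF computation popularized by BERTopic's tutorial, adapted to the Chinese
    # --- stop-word list loaded above; it is an assumption about how the truncated body
    # --- continues, not the author's exact code. Here m is the total number of documents.
    count = CountVectorizer(ngram_range=ngram_range, stop_words=my_stopwords).fit(documents)
    t = count.transform(documents).toarray()           # term counts per class (documents joined per cluster)
    w = t.sum(axis=1)                                   # total word count per class
    tf = np.divide(t.T, w)                              # class-level term frequency
    sum_t = t.sum(axis=0)
    idf = np.log(np.divide(m, sum_t)).reshape(-1, 1)    # inverse class frequency
    tf_idf = np.multiply(tf, idf)
    return tf_idf, count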