Anatomy of OpenAI's Developer Community
OpenAI has an official developer community hosted on Discourse, which is the central place for people seeking help and having conversations about OpenAI's APIs, ChatGPT, prompting, and more.
The forum was launched in March 2021 and has since seen 100,000+ posts by over 20,000 users.
Given the size and concentration of topics on the forum, it is a great resource for understanding the general sentiment of developers, identifying common problems and rabbit holes users face, and gathering feedback on OpenAI products.
To get deeper insight into developer experience and shared sentiment about specific products, we downloaded all the posts from the most active categories on the forum. The following categories and their relevant sub-categories are included:
- API
- API/Bugs
- API/Deprecations
- API/Feedback
- GPT Builders
- GPT Builders/Chat-Plugins
- GPT Builders/Plugin-Store
- Prompting
- Community
- Documentation
We created a dataset of all posts and discussions in the above categories that took place on the forum up to 28th February 2024.
- 🤗 HuggingFace Link
But... why?
We believe there's a lot to learn from what people are struggling with and from developer sentiment about the experience of using OpenAI's products.
This dataset was made so that we could answer these questions. There's a lot of potential in learning from OpenAI's mistakes and successes.
We at Julep would love to hear what you build from the dataset! Hit us up on X/Twitter or email.
Getting data from Discourse
Every Discourse Discussion returns its data as JSON if you append .json to the URL.
- Discussion URL:
https://community.openai.com/t/{discussion_id}
- Discussion in JSON:
https://community.openai.com/t/{discussion_id}.json
- Discussion in Markdown:
https://community.openai.com/raw/{discussion_id}
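For instance, a quick way to verify the JSON endpoint from Python (the discussion ID is an arbitrary public one, borrowed from a post that appears later in this notebook; field names follow the public Discourse API):
import requests
# Fetch a single discussion as JSON; 28471 is an illustrative discussion ID
data = requests.get("https://community.openai.com/t/28471.json").json()
print(data["title"], "-", len(data["post_stream"]["posts"]), "posts in the first page")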
Raw data was gathered into a single JSONL file by automating a browser using Playwright.
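The crawler itself isn't reproduced here, but a minimal sketch of the approach might look like this (the discussion IDs and output path are placeholders; the real crawl also needs topic enumeration and rate limiting):
import json
from playwright.sync_api import sync_playwright

BASE_URL = "https://community.openai.com/t/{}.json"
discussion_ids = [28471, 86249]  # placeholder IDs; the real crawl enumerates all topics

with sync_playwright() as p:
    browser = p.chromium.launch()
    page = browser.new_page()
    with open("discussions.jsonl", "a") as f:
        for did in discussion_ids:
            resp = page.goto(BASE_URL.format(did))
            if resp and resp.ok:
                # One JSON object per line -> a single JSONL file of raw data
                f.write(json.dumps(resp.json()) + "\n")
    browser.close()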
Let's walk through how the dataset was made and then showcase some initial trends we noticed.
Feature Engineering
A brief walkthrough of how the features were engineered.
Since the raw data had one Discussion per row, and each Discussion contains multiple Posts in a thread, the dataset needed to be normalised to the post level. Features prefixed post_ describe an individual post, while features prefixed post_discussion_ describe the discussion the post belongs to (a sketch of this flattening follows the examples below).
For example:
- Post-level features: post_id, post_author
- Discussion-level features: post_discussion_id, post_category_id
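A minimal sketch of that flattening, assuming the raw JSON structure returned by the public Discourse API (the selection of fields is illustrative):
def flatten_discussion(discussion: dict) -> list[dict]:
    """Turn one raw Discourse discussion into post-level rows."""
    rows = []
    for post in discussion["post_stream"]["posts"]:
        rows.append(
            {
                # Post-level features
                "post_id": post["id"],
                "post_author": post["username"],
                "post_content": post["cooked"],
                # Discussion-level features, repeated on every post row
                "post_discussion_id": discussion["id"],
                "post_discussion_title": discussion["title"],
                "post_category_id": discussion["category_id"],
            }
        )
    return rows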
%matplotlib widget
import pandas as pd
from datasets import Dataset, load_from_disk, load_dataset
hf_dataset = load_from_disk("9_dataset_with_topics")
# hf_dataset = load_dataset("julep-ai/openai-community-posts")
df = hf_dataset.to_pandas()
hf_dataset.features
{'post_discussion_id': Value(dtype='int64', id=None),
 'post_discussion_tags': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None),
 'post_discussion_title': Value(dtype='string', id=None),
 'post_discussion_created_at': Value(dtype='timestamp[ns, tz=UTC]', id=None),
 'post_category_id': Value(dtype='int64', id=None),
 'post_discussion_views': Value(dtype='int64', id=None),
 'post_discussion_reply_count': Value(dtype='int64', id=None),
 'post_discussion_like_count': Value(dtype='int64', id=None),
 'post_discussion_participant_count': Value(dtype='int64', id=None),
 'post_discussion_word_count': Value(dtype='float64', id=None),
 'post_id': Value(dtype='int64', id=None),
 'post_author': Value(dtype='string', id=None),
 'post_created_at': Value(dtype='string', id=None),
 'post_content': Value(dtype='string', id=None),
 'post_read_count': Value(dtype='int64', id=None),
 'post_reply_count': Value(dtype='int64', id=None),
 'post_author_id': Value(dtype='int64', id=None),
 'post_number': Value(dtype='int64', id=None),
 'post_discussion_related_topics': Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None),
 'accepted_answer_post': Value(dtype='float64', id=None),
 'post_content_raw': Value(dtype='string', id=None),
 'post_category_name': Value(dtype='string', id=None),
 'post_sentiment': Value(dtype='string', id=None),
 'post_sentiment_score': Value(dtype='float64', id=None),
 'post_content_cluster_embedding': Sequence(feature=Value(dtype='float64', id=None), length=-1, id=None),
 'post_content_classification_embedding': Sequence(feature=Value(dtype='float64', id=None), length=-1, id=None),
 'post_content_search_document_embedding': Sequence(feature=Value(dtype='float64', id=None), length=-1, id=None),
 'tag1': Value(dtype='string', id=None),
 'tag2': Value(dtype='string', id=None),
 'tag3': Value(dtype='string', id=None),
 'tag4': Value(dtype='string', id=None),
 'post_discussion_url': Value(dtype='string', id=None),
 'post_url': Value(dtype='string', id=None),
 'topic_model_medium': Value(dtype='string', id=None),
 'topic_model_broad': Value(dtype='string', id=None)}
# Total number of posts
print("Total number of posts: ", len(df))
# Total discussions
print("Total discussions: ", len(df["post_discussion_id"].unique()))
# Total number of users
print("Total number of users: ", len(df["post_author_id"].unique()))
Total number of posts:  97033
Total discussions:  18990
Total number of users:  21419
# Earliest and latest post
print("Earliest post: ", df["post_created_at"].min())
print("Latest post: ", df["post_created_at"].max())
Earliest post:  2021-03-10T20:39:25.848Z
Latest post:  2024-02-27T14:03:01.685Z
Apart from the post- and discussion-level features, the following derived features were computed:
- Sentiment (via Twitter-roBERTa-base)
- Vector Embeddings
- Topic Models
Sentiment
Using Twitter-roBERTa-base for sentiment analysis, we generated a post_sentiment label (negative, positive, neutral) and a post_sentiment_score confidence score for each post.
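The exact inference setup isn't shown in the notebook, but a minimal sketch with the cardiffnlp checkpoint on the Hugging Face Hub might look like this:
from transformers import pipeline

# Twitter-roBERTa-base fine-tuned for sentiment; emits negative/neutral/positive labels
sentiment = pipeline(
    "sentiment-analysis",
    model="cardiffnlp/twitter-roberta-base-sentiment-latest",
)

result = sentiment("The new function calling API keeps timing out.")[0]
# result maps onto the dataset's post_sentiment / post_sentiment_score columns
print(result["label"], round(result["score"], 3))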
On average, most posts are neutral.
df["post_sentiment"].value_counts(ascending=True, normalize=True)
post_sentiment
negative    0.185277
positive    0.219327
neutral     0.595395
Name: proportion, dtype: float64
However, looking at the distribution per category, we see that the api and api/bugs categories have the most negative sentiment.
On the other hand, community and gpts-builders/plugin-store have the most positive sentiment, which tracks: people often showcase cool projects, news, and the latest AI developments in community!
# Group by 'post_category_name' and then apply normalized value_counts to 'post_sentiment'
sentiment_percentages = df.groupby("post_category_name")["post_sentiment"].apply(
lambda x: x.value_counts(normalize=True)
)
# Convert the Series to a DataFrame and reset the index
sentiment_percentages = sentiment_percentages.reset_index(name="percentage")
# Pivot the table for better readability
pivot_df = sentiment_percentages.pivot(
    index="post_category_name", columns="level_1", values="percentage"
)
# Fill NaN values with zero if any sentiment labels are missing in a category
pivot_df = pivot_df.fillna(0)
pivot_df.columns.rename(None, inplace=True)
# Display the pivoted DataFrame
pivot_df
| post_category_name | negative | neutral | positive |
|---|---|---|---|
| api | 0.188675 | 0.624195 | 0.187131 |
| api/bugs | 0.376378 | 0.533858 | 0.089764 |
| api/deprecations | 0.161049 | 0.662921 | 0.176030 |
| api/feedback | 0.261770 | 0.553672 | 0.184557 |
| community | 0.137866 | 0.502298 | 0.359837 |
| documentation | 0.137372 | 0.559727 | 0.302901 |
| gpts-builders | 0.260511 | 0.597313 | 0.142176 |
| gpts-builders/chat-plugins | 0.232624 | 0.538543 | 0.228833 |
| gpts-builders/plugin-store | 0.187500 | 0.506944 | 0.305556 |
| prompting | 0.133054 | 0.633530 | 0.233416 |
Vector Embeddings
For calculating vector embeddings, Nomic Embed Text v1.5 was run locally with the help of text-embeddings-inference. Because of its Matryoshka (resizable) representation, these embeddings can be truncated to smaller dimensions for a range of future applications.
Nomic Embed v1.5 was largely selected for its large context length.
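A rough sketch of what calling such a local server might look like (the port and helper function are assumptions; Nomic's task prefixes select the embedding flavour):
import requests

TEI_URL = "http://localhost:8080/embed"  # assumed local text-embeddings-inference server

def embed(text: str, task: str = "search_document") -> list[float]:
    # Nomic Embed v1.5 selects the use case via task prefixes such as
    # "search_document: ", "clustering: " and "classification: "
    response = requests.post(TEI_URL, json={"inputs": f"{task}: {text}", "truncate": True})
    response.raise_for_status()
    return response.json()[0]

vector = embed("How do I fix a function calling timeout?", task="clustering")
print(len(vector))  # 768 dimensions at the full Matryoshka size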
import matplotlib.pyplot as plt
import seaborn as sns
df["post_content_raw_length"] = df["post_content_raw"].apply(len)
plt.figure(figsize=(12, 6))
sns.histplot(
df["post_content_raw_length"], bins=100, kde=False, cumulative=True, stat="density"
)
plt.title("CDF of Length Distribution of post_content_raw")
plt.xlabel("Length of post_content_raw")
plt.ylabel("Cumulative Density")
plt.show()
Looking at the cumulative distribution plot, we see that 99.7% of the posts are shorter than 8192 characters. Since a token corresponds to roughly four characters, that is only about 2048 tokens, comfortably inside the model's 8192-token context, so we can vectorise post_content_raw with truncate=True without worrying about a lot of knowledge and data being lost.
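As a quick sanity check of that figure, using the length column computed above:
# Fraction of posts that fit within the 8192-character cutoff
fraction_within = (df["post_content_raw_length"] <= 8192).mean()
print(f"{fraction_within:.1%} of posts are at most 8192 characters long")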
Nomic supports embeddings for searching, clustering, and classification tasks.
We've computed all three types of embeddings over the post_content_raw field.
df[
[
"post_content_cluster_embedding",
"post_content_classification_embedding",
"post_content_search_document_embedding",
]
]
| | post_content_cluster_embedding | post_content_classification_embedding | post_content_search_document_embedding |
|---|---|---|---|
| 0 | [-0.011406975, 0.051801503, -0.17560473, -0.02... | [-0.029118164, 0.031905293, -0.1484513, -0.015... | [-0.02271616, 0.033384237, -0.15369709, -0.017... |
| 1 | [0.075202845, 0.039509684, -0.21858266, -0.039... | [0.054288954, 0.008444463, -0.20109606, -0.042... | [0.06603962, 0.0113762, -0.15860154, -0.031844... |
| 2 | [0.081624806, 0.051376425, -0.21687175, -0.017... | [0.073775776, 0.034063034, -0.19114846, -0.007... | [0.049514644, 0.026369868, -0.17453438, -0.011... |
| 3 | [0.04684566, 0.07910612, -0.2271005, -0.007859... | [0.013259176, 0.015849816, -0.22634435, -0.022... | [0.012498306, -0.00900329, -0.092770934, 0.007... |
| 4 | [-0.016075207, 0.10314193, -0.22071771, -0.024... | [-0.034080368, 0.09957978, -0.20546404, -0.018... | [-0.015658986, 0.071472555, -0.19949938, -0.00... |
| ... | ... | ... | ... |
| 97028 | [0.04684566, 0.07910612, -0.2271005, -0.007859... | [0.013259176, 0.015849816, -0.22634435, -0.022... | [0.012498306, -0.00900329, -0.092770934, 0.007... |
| 97029 | [0.032625105, 0.052557576, -0.15643555, -0.055... | [0.019226272, 0.02287624, -0.1287021, -0.05793... | [0.012006246, 0.022498403, -0.10844656, -0.033... |
| 97030 | [0.01553116, 0.03656999, -0.15440144, -0.06329... | [-0.0005156845, 0.011319388, -0.11510259, -0.0... | [0.0049003367, 0.009971226, -0.12864526, -0.05... |
| 97031 | [0.03986051, 0.048007715, -0.17821708, -0.0489... | [0.0199232, 0.019354336, -0.14687058, -0.04700... | [0.016168084, 0.03449353, -0.16987395, -0.0337... |
| 97032 | [0.04684566, 0.07910612, -0.2271005, -0.007859... | [0.013259176, 0.015849816, -0.22634435, -0.022... | [0.012498306, -0.00900329, -0.092770934, 0.007... |

97033 rows × 3 columns
from nomic import AtlasDataset
from IPython.core.display import HTML
dataset = AtlasDataset(identifier="glitch/openai-community-posts---clustering---v2")
2024-03-20 16:22:42.072 | INFO | nomic.dataset:__init__:779 - Loading existing dataset `glitch/openai-community-posts---clustering---v2``.
HTML(dataset.maps[0]._embed_html())
Atlas is a very cool tool with a great set of filters. Feel free to explore the dataset above!
As a general rule, the Search, Filter, Lasso, and Cherry Pick tools on the left side help with selecting and refining datapoints, while View Settings on the right side has nifty visualisations.
For example: filter for all points where sentiment is negative, and set Color By in View Settings to post_discussion_views on a logarithmic scale.
Vector Search
It is quite powerful to be able to execute similarity searches based on post IDs.
A Q&A interface built on these embeddings over the post contents could speed up research over the community posts (if you know the right questions to ask :P).
Let's view some posts similar to this one complaining about function calling.
atlas_map = dataset.maps[0]  # avoid shadowing the built-in `map`
neighbors, distances = atlas_map.embeddings.vector_search(ids=["Fjk"], k=7)
similar_datapoints = dataset.get_data(ids=neighbors[0])
for i, point in enumerate(similar_datapoints):
if i == 0:
print("Initial point:", point.get("post_discussion_title"), "\n")
print("Nearest neighbors:")
else:
print(point.get("post_discussion_title"))
Initial point: Gpt-4-1106-preview messes up function call parameters encoding

Nearest neighbors:
Gpt-4-1106-preview messes up function call parameters encoding
When structuring the output of function calls, there is Chinese character encoding issue resulting in garbled text
Confused on models that have function calling and when they get deprecated
Gpt-4-1106-preview is not generating utf-8
Gpt-3.5-turbo-1106 Calls multiple of the same function unecessarily
There is a mistake on the doc page of function calling
Preliminary Data Analysis
After completing feature engineering, we are left with 36 total features to explore.
Here, we give some basic information about the dataset and its features, from which one could continue exploring.
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import numpy as np
import seaborn as sns
print("Total features:", df.columns.__len__())
Total features: 36
og_df = df.copy(deep=True)
Correlation Heatmap
Logically, replies, likes, and word count are highly correlated with one another.
Interestingly, having an accepted answer also boosts the views of the discussion.
# Identify columns that contain lists, arrays or strings
cols_to_exclude = [
col
for col in df.columns
if df[col].apply(lambda x: isinstance(x, (list, np.ndarray, str))).any()
]
cols_to_exclude.extend(
[
"post_discussion_id",
"post_category_id",
"post_id",
"post_author_id",
]
)
# Create a new DataFrame that only includes columns with single numerical values
df_numerical = df.drop(columns=cols_to_exclude)
# Calculate the correlation matrix
corr = df_numerical.corr()
# Plot the heatmap
plt.figure(figsize=(15, 10))
sns.heatmap(corr, annot=False, cmap="coolwarm")
plt.xticks(rotation=45) # Rotate x-axis labels
plt.tight_layout() # Adjust plot margins
plt.show()
2023 is when interest in OpenAI, and thus its community, really started building up. OpenAI Dev Day in November 2023 led to a huge increase in interest around OpenAI.
It would be interesting to see how many users joined the community each month too!
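The dataset doesn't include signup dates, but the month of each author's first post makes a reasonable proxy; a rough sketch:
# Approximate monthly "new user" counts via each author's first post in the dataset
created = pd.to_datetime(df["post_created_at"])
first_post_month = created.groupby(df["post_author_id"]).min().dt.to_period("M")
print(first_post_month.value_counts().sort_index().tail())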
Volume of Posts Over Time
# Parse timestamps and bucket posts by month
df["post_created_at"] = pd.to_datetime(df["post_created_at"])
df["year_month"] = df["post_created_at"].dt.to_period("M")
# Count sentiment labels by month
colors = {"negative": "red", "neutral": "blue", "positive": "green"}
sentiment_label_counts_by_month = (
df.groupby(["year_month", "post_sentiment"]).size().unstack(fill_value=0)
)
# Calculate proportions of sentiment labels by month
total_posts_per_month = sentiment_label_counts_by_month.sum(axis=1)
sentiment_label_proportions_by_month = sentiment_label_counts_by_month.divide(
total_posts_per_month, axis=0
)
sentiment_label_counts_by_month.plot(
kind="bar",
stacked=True,
figsize=(14, 8),
color=[colors[col] for col in sentiment_label_counts_by_month.columns],
)
plt.title("Volume of Posts Over Time")
plt.xlabel("Month")
plt.ylabel("Number of Posts")
plt.xticks(rotation=45)
plt.legend(title="Sentiment")
plt.tight_layout()
plt.show()
Average Sentiment Over Time
In the same vein, a significantly larger number of people seem to be happy post Dev Day!
df["post_created_at"] = pd.to_datetime(df["post_created_at"])
# Set the 'post_created_at' column as the index
df.set_index("post_created_at", inplace=True)
monthly_sentiment = (
df.resample("ME")["post_sentiment"].value_counts().unstack(fill_value=0)
)
# Plotting
plt.figure(figsize=(15, 8))
for sentiment in monthly_sentiment.columns:
    plt.plot(
        monthly_sentiment.index,
        monthly_sentiment[sentiment],
        color=colors[sentiment],
        label=sentiment,
    )
plt.legend(title="Sentiment")
# Formatting the x-axis to show Month-Year
plt.gca().xaxis.set_major_formatter(mdates.DateFormatter("%b-%Y"))
plt.gca().xaxis.set_major_locator(mdates.MonthLocator())
# Improve x-axis labels readability
plt.gcf().autofmt_xdate()
plt.title("Average Sentiment Over Time")
plt.xlabel("Time")
plt.ylabel("Number of Posts")
plt.grid(True)
plt.show()
Engagement Metrics Over Time
Engagement understandably peaked around the two main events of 2023: the launch of GPT-4 and OpenAI Dev Day.
aggregated_data = df.resample("ME", on="post_discussion_created_at").agg(
{
"post_discussion_views": "sum",
"post_discussion_like_count": "sum",
"post_discussion_reply_count": "sum",
}
)
fig, ax1 = plt.subplots(figsize=(15, 7))
color = "tab:red"
ax1.set_xlabel("Time")
ax1.set_ylabel("Views", color=color)
ax1.plot(aggregated_data.index, aggregated_data["post_discussion_views"], color=color)
ax1.tick_params(axis="y", labelcolor=color)
ax2 = ax1.twinx() # instantiate a second axes that shares the same x-axis
color = "tab:blue"
ax2.set_ylabel(
"Likes and Replies", color=color
) # we already handled the x-label with ax1
ax2.plot(
aggregated_data.index,
aggregated_data["post_discussion_like_count"],
color="blue",
label="Likes",
)
ax2.plot(
aggregated_data.index,
aggregated_data["post_discussion_reply_count"],
color="green",
label="Replies",
)
ax2.tick_params(axis="y", labelcolor=color)
# Add a horizontal line and a label at November 2023
dev_day = mdates.date2num(
pd.to_datetime("2023-11-06")
) # Convert the date to matplotlib's internal format
gpt4_launch = mdates.date2num(pd.to_datetime("2023-03-14"))
ax2.axvline(dev_day, color="black", linestyle="--") # Add a vertical line
ax2.axvline(gpt4_launch, color="black", linestyle="--") # Add a vertical line
ax2.text(
dev_day,
ax2.get_ylim()[1],
"OpenAI Dev Day",
horizontalalignment="left",
verticalalignment="top",
) # Add a label
ax2.text(
gpt4_launch,
ax2.get_ylim()[1],
"GPT-4 Launch",
horizontalalignment="left",
verticalalignment="top",
) # Add a label
fig.tight_layout() # otherwise the right y-label is slightly clipped
plt.legend(loc="upper left")
plt.show()
df = og_df.copy(deep=True)
Weighted Sentiment Score Over Time
Let's compute a weighted sentiment score for each topic and plot it over time.
The weighted score falls into the following bands:
- -1.0 to -0.1: Negative
- -0.1 to 0.1: Neutral
- 0.1 to 1.0: Positive
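As a quick worked example of the scheme implemented below (the post values are illustrative): a "negative" label with confidence 0.8 maps to -1 × 0.8 = -0.8, and the adjusted variant boosts non-neutral posts by 1.5×, giving -1.2. Note that this boost can push adjusted scores slightly past the ±1.0 band edges.
# Illustrative recomputation of the weighting scheme used below
sentiment_numeric = {"negative": -1, "neutral": 0, "positive": 1}
label, confidence = "negative", 0.8  # hypothetical post
weighted = sentiment_numeric[label] * confidence  # -0.8
adjusted = weighted * (1.5 if label != "neutral" else 1.0)  # -1.2 after the boost
print(weighted, adjusted)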
df["post_created_at"] = pd.to_datetime(df["post_created_at"])
# Assign numeric values to sentiment labels.
sentiment_numeric = {"negative": -1, "neutral": 0, "positive": 1}
df["sentiment_numeric"] = df["post_sentiment"].map(sentiment_numeric)
# Calculate weighted sentiment score.
df["weighted_sentiment"] = df["sentiment_numeric"] * df["post_sentiment_score"]
# Group by topic and month, then calculate the average weighted sentiment.
df["year_month"] = df["post_created_at"].dt.to_period("M")
grouped = (
df.groupby(["topic_model_broad", "year_month"])["weighted_sentiment"]
.mean()
.reset_index()
)
# Pivot for easier plotting.
pivot_table = grouped.pivot(
index="year_month", columns="topic_model_broad", values="weighted_sentiment"
)
df["adjusted_weight"] = df.apply(
lambda row: (
row["post_sentiment_score"] * 1.5
if row["sentiment_numeric"] != 0
else row["post_sentiment_score"]
),
axis=1,
)
# Calculate weighted sentiment score using the adjusted weights.
df["weighted_sentiment_adjusted"] = df["sentiment_numeric"] * df["adjusted_weight"]
# Group by topic and month, then calculate the average adjusted weighted sentiment.
df["year_month"] = df["post_created_at"].dt.to_period("M")
grouped_adjusted = (
df.groupby(["topic_model_broad", "year_month"])["weighted_sentiment_adjusted"]
.mean()
.reset_index()
)
# Pivot for easier plotting.
pivot_table_adjusted = grouped_adjusted.pivot(
index="year_month",
columns="topic_model_broad",
values="weighted_sentiment_adjusted",
)
# Plotting
plt.figure(figsize=(14, 8))
pivot_table_adjusted.index = pivot_table_adjusted.index.to_timestamp()
# Exclude the emoji topic cluster from the plot
pivot_table_adjusted.drop(["Emoji (8)"], axis=1, inplace=True)
for column in pivot_table_adjusted.columns:
clean_series_adjusted = pivot_table_adjusted[column].dropna()
plt.plot(
clean_series_adjusted.index,
clean_series_adjusted,
marker="",
linewidth=2,
label=column,
)
# Add horizontal bands
plt.fill_between(clean_series_adjusted.index, -0.1, 0.1, color="blue", alpha=0.1)
plt.fill_between(clean_series_adjusted.index, 0.1, 1.0, color="green", alpha=0.1)
plt.fill_between(clean_series_adjusted.index, -1.0, -0.1, color="red", alpha=0.1)
plt.title("Average Adjusted Weighted Sentiment Score Over Time by Topic Model")
plt.xlabel("Time")
plt.ylabel("Average Adjusted Weighted Sentiment Score")
plt.legend(title="Topic Model", loc="best")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
# Number of unique topics in each topic model
print("Length of medium topic model:", len(df["topic_model_medium"].unique()))
print("Length of broad topic model:", len(df["topic_model_broad"].unique()))
Length of medium topic model: 256
Length of broad topic model: 8
Feedback, Complaints and Requests
Let's switch gears and look at common complaints and feedback by filtering posts on their topic-model labels. After going through the 256 topics from the medium topic model, the following 48 topics were selected.
These topics roughly cover all discussions about developing with OpenAI products like GPTs, the Assistants API, and Embeddings, along with the common issues, FAQs, and complaints surrounding them.
Selected Topics
- Assistant
- API Usage
- Chatbot
- Python3 Packages
- API Development
- Performance
- GPUs, Performance, Compute
- Pricing
- Retrying
- Product Development
- Threads
- Embeddings
- Parallel Methods
- Schema
- Knowledge Retrieval
- OpenAI
- Assistants, API, Platform
- JSON Format
- JSON Format (2)
- Embeddings (2)
- APIs
- Error Handling
- Embeddings (3)
- Streaming
- Invalid Request Error
- Completion
- AI Assistants
- JSON Validation
- Conversation History
- Assistant Tools
- Functions
- Functions (2)
- Functions (3)
- Functions (4)
- Threads (2)
- User Assistant
- API
- Vector Space Search
- Summarization
- Embeddings (5)
- Logit
- Embedding Vectors
- GPTs
- Functions
- Embedding
- API Development
- AI Development
- Assistants
selected_topics = [
"Assistant",
"API Usage",
"Chatbot",
"Python3 Packages",
"API Development",
"Performance",
"GPUs, Performance, Compute",
"Pricing",
"Retrying",
"Product Development",
"Threads",
"Embeddings",
"Parallel Methods",
"Schema",
"Knowledge Retrieval",
"OpenAI",
"Assistants, API, Platform",
"JSON Format",
"JSON Format (2)",
"Embeddings (2)",
"APIs",
"Error Handling",
"Embeddings (3)",
"Streaming",
"Invalid Request Error",
"Completion",
"AI Assistants",
"JSON Validation",
"Conversation History",
"Assistant Tools",
"Functions",
"Functions (2)",
"Functions (3)",
"Functions (4)",
"Threads (2)",
"User Assistant",
"API",
"Vector Space Search",
"Summarization",
"Embeddings (5)",
"Logit",
"Embedding Vectors",
"GPTs",
"Functions",
"Embedding",
"API Development",
"AI Development",
"Assistants",
]
topic_df = df[df["topic_model_medium"].isin(selected_topics)].copy(deep=True)
len(selected_topics)
48
Being selective about these topics allows us to dive deeper into individual issues and build an intuition for the trends in negative sentiment devs run into.
# view rows where topic model is in selected topics, post_sentiment is negative and ordered by post_discussion_views
topic_df[
(topic_df["post_sentiment"] == "negative") & (topic_df["post_discussion_views"] > 0)
].drop_duplicates(subset="post_discussion_id").sort_values(
"post_discussion_views", ascending=False
)[
[
"post_discussion_title",
"post_content_raw",
"topic_model_medium",
"post_discussion_views",
"post_sentiment",
"post_discussion_url",
]
]
| | post_discussion_title | post_content_raw | topic_model_medium | post_discussion_views | post_sentiment | post_discussion_url |
|---|---|---|---|---|---|---|
| 11898 | Cheat Sheet: Mastering Temperature and Top_p i... | Presence\_penalty was only mentioned once as a... | Logit | 113139 | negative | https://community.openai.com/t/172683 |
| 11179 | OpenAI API keys in free account | I have genertaed openai api keys in the free a... | API Usage | 89960 | negative | https://community.openai.com/t/348972 |
| 64217 | Your account was flagged for potential abuse | Same problem. And I can't write anything in [h... | OpenAI | 87260 | negative | https://community.openai.com/t/156597 |
| 5484 | Getting response data as a fixed & Consistent ... | I have such common issue where JSON data is no... | JSON Format | 78668 | negative | https://community.openai.com/t/28471 |
| 34098 | Is chat GPT provided for free | I have only $5 \n\nWhat are you jealous of?\n\n | Pricing | 75918 | negative | https://community.openai.com/t/86249 |
| ... | ... | ... | ... | ... | ... | ... |
| 194 | Anyone experiences no message response using a... | Hi, I just encountered very strange behavior w... | Assistant Tools | 53 | negative | https://community.openai.com/t/656254 |
| 1485 | Request always fail after the first function c... | I am using model gpt3.5-turbo with chat comple... | Python3 Packages | 53 | negative | https://community.openai.com/t/647723 |
| 839 | Help with using openAI Assistants via API in J... | I'm trying to create a simple chrome plugin th... | Error Handling | 53 | negative | https://community.openai.com/t/652635 |
| 223 | Assistant API: Empty message is generated with... | Hi whung,\n\n\nI think there might me some err... | Assistants, API, Platform | 52 | negative | https://community.openai.com/t/656170 |
| 77670 | OpenAPI spec can have a maximum of 30 operations | I am building an action for an API with lots o... | Python3 Packages | 31 | negative | https://community.openai.com/t/586484 |

2146 rows × 6 columns
Posts with Most Negative Sentiment by Topic
Some topics tend to cause more frustration than others, which we can see by sorting the topics by the number of posts with negative sentiment.
# add a column for the length of the post_content_raw
topic_df["post_content_raw_length"] = topic_df["post_content_raw"].apply(len)
# Filter the dataframe to include only posts with negative sentiment
negative_posts = topic_df[topic_df["post_sentiment"] == "negative"]
# Group the negative posts by topic and count the number of posts in each topic
negative_posts_by_topic = (
negative_posts.groupby("topic_model_medium").size().reset_index(name="count")
)
# Sort the topics by the count of negative posts in descending order
negative_posts_by_topic = negative_posts_by_topic.sort_values("count", ascending=False)
# Plot the bar chart
plt.figure(figsize=(12, 6))
sns.barplot(
x="count", y="topic_model_medium", data=negative_posts_by_topic, color="red"
)
plt.title("Posts with Most Negative Sentiment by Topic")
plt.xlabel("Number of Posts")
plt.ylabel("Topic")
plt.tight_layout()
plt.show()
Percentage of Posts by Sentiment and Topic
Surprisingly, the top 15 topics in this graph break the overall trend of 18.5% negative posts. In these topics specifically, negative sentiment can reach up to 50%!
sentiment_percentages = df["post_sentiment"].value_counts(normalize=True) * 100
print(sentiment_percentages)
post_sentiment
neutral     59.539538
positive    21.932745
negative    18.527717
Name: proportion, dtype: float64
# Group the posts by topic and sentiment, and count the number of posts in each group
posts_by_topic_and_sentiment = (
topic_df.groupby(["topic_model_medium", "post_sentiment"])
.size()
.reset_index(name="count")
)
colors = {"negative": "red", "neutral": "blue", "positive": "green"}
# Calculate the total number of posts for each topic
total_posts_by_topic = topic_df.groupby("topic_model_medium").size()
# Calculate the percentage of posts for each sentiment within each topic
posts_by_topic_and_sentiment["percentage"] = posts_by_topic_and_sentiment.apply(
lambda row: row["count"] / total_posts_by_topic[row["topic_model_medium"]] * 100,
axis=1,
)
# Pivot the DataFrame to get the percentage of posts for each sentiment as separate columns
pivot_df = posts_by_topic_and_sentiment.pivot(
index="topic_model_medium", columns="post_sentiment", values="percentage"
).fillna(0)
pivot_df = pivot_df.sort_values("negative", ascending=False)
# Plot the stacked bar chart
pivot_df.plot(
kind="bar",
stacked=True,
figsize=(12, 6),
color=[colors[col] for col in pivot_df.columns],
)
plt.title("Percentage of Posts by Sentiment and Topic")
plt.xlabel("Topic")
plt.ylabel("Percentage of Posts")
plt.legend(title="Sentiment")
# plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
People really like to complain about performance!
Wrapping up
We've only just begun to scratch the surface of the useful data that can be analysed about OpenAI's products, launches, tools, and the trends surrounding their usage. This post is meant to be an introduction to the dataset and to show some threads that can be explored further.
We at Julep AI are building an open-source platform for crafting AI Agents. In the spirit of open source, we decided to make this dataset public in the hope of helping other people identify the cracks and gaps that OpenAI cannot fix, doesn't want to fix, or is opinionated against. Given the activity of the community and the ease of downloading the data, it made perfect sense to leverage this low-hanging fruit of knowledge.
We hope that this dataset helps other people building AI stacks.
We'd love to hear more about how you used the dataset or just talk in general! Hit us up at:
- Twitter/X: https://x.com/julep_ai
- Email: Ishita (ishita@julep.ai) / Sid (sid@julep.ai)