import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
import plotly.express as px
# Define a small, hand-specified catalogue of classic books (titles and authors);
# nothing is actually loaded from an external corpus here
books = [
    {"title": "Moby Dick", "author": "Herman Melville"},
    {"title": "Alice in Wonderland", "author": "Lewis Carroll"},
    {"title": "Dracula", "author": "Bram Stoker"},
    {"title": "Frankenstein", "author": "Mary Shelley"},
    {"title": "Pride and Prejudice", "author": "Jane Austen"},
    {"title": "The Odyssey", "author": "Homer"},
    {"title": "Macbeth", "author": "William Shakespeare"},
    {"title": "Hamlet", "author": "William Shakespeare"},
    {"title": "Romeo and Juliet", "author": "William Shakespeare"},
    {"title": "The Iliad", "author": "Homer"},
    {"title": "Great Expectations", "author": "Charles Dickens"},
    {"title": "A Tale of Two Cities", "author": "Charles Dickens"},
    {"title": "Jane Eyre", "author": "Charlotte Brontë"},
    {"title": "Wuthering Heights", "author": "Emily Brontë"},
    {"title": "The Scarlet Letter", "author": "Nathaniel Hawthorne"}
]
# Convert the list of books to a pandas DataFrame
book_data = pd.DataFrame(books)
# Display basic info about the dataset
# (info() prints directly and returns None, so it is not wrapped in print())
print("Dataset Information:")
book_data.info()
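# Optional: preview the first few rows to confirm the columns loaded as expected
print(book_data.head())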
# Plot the number of books by author (capped at the top 15 authors by book count)
author_counts = book_data['author'].value_counts().head(15)
author_count_fig = px.bar(
    x=author_counts.values, y=author_counts.index, orientation='h',
    labels={'x': 'Book Count', 'y': 'Author'},
    title='Book Count by Author'
)
author_count_fig.show()
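# Optional sketch: save the chart to a standalone HTML file for sharing
# (the file name below is arbitrary/illustrative)
author_count_fig.write_html("book_count_by_author.html")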
# Combine the title and author into a single string for each book
book_data['book_content'] = book_data['title'] + ' ' + book_data['author']
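# For example, the first row's combined text is "Moby Dick Herman Melville"
print(book_data['book_content'].iloc[0])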
# Use Term Frequency-Inverse Document Frequency to vectorize the text data
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(book_data['book_content'])
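# Optional: inspect the matrix shape (rows = books, columns = unique terms after
# stop-word removal) and the learned vocabulary; get_feature_names_out() assumes
# scikit-learn >= 1.0
print(tfidf_matrix.shape)
print(tfidf_vectorizer.get_feature_names_out())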
# Compute pairwise cosine similarity between the TF-IDF vectors.
# linear_kernel (a plain dot product) equals cosine similarity here because
# TfidfVectorizer L2-normalizes each row by default.
vector_similarity = linear_kernel(tfidf_matrix, tfidf_matrix)
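# Optional sanity check: since the TF-IDF rows are L2-normalized, linear_kernel
# should produce the same matrix as cosine_similarity
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
assert np.allclose(vector_similarity, cosine_similarity(tfidf_matrix))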
# Recommendation function for the top three most similar books
def generate_top_three_recommendations(book_title, vector_similarity=vector_similarity):
    # Locate the row index of the requested title
    book_index = book_data[book_data['title'] == book_title].index[0]
    # Pair every book index with its similarity score to the requested book
    similarity_scores = list(enumerate(vector_similarity[book_index]))
    # Sort by similarity, highest first
    similarity_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)
    # Take the top three matches, skipping position 0 (the book itself)
    top_three_scores = similarity_scores[1:4]
    top_three_indices = [i[0] for i in top_three_scores]
    return book_data['title'].iloc[top_three_indices]
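# Optional, illustrative variant (not part of the original flow): the same lookup,
# but returning the similarity scores alongside the recommended titles
def generate_top_three_with_scores(book_title, vector_similarity=vector_similarity):
    book_index = book_data[book_data['title'] == book_title].index[0]
    similarity_scores = sorted(
        enumerate(vector_similarity[book_index]), key=lambda x: x[1], reverse=True
    )[1:4]
    return [(book_data['title'].iloc[i], round(score, 3)) for i, score in similarity_scores]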
# Example book title for top three recommendations
book = "Romeo and Juliet"
top_three_recommendations = generate_top_three_recommendations(book)
print(f"\nTop Three Books Recommended for '{book}':")
print(top_three_recommendations)
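# Usage of the illustrative scores variant defined above
print(generate_top_three_with_scores(book))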