import pandas as pd
import numpy as np
import re
from bokeh.charts import Bar, Scatter, output_notebook, show, output_file
from bokeh.charts.attributes import CatAttr, color
from bokeh.models import HoverTool, Range1d, Span, LabelSet, ColumnDataSource, Title, NumeralTickFormatter
from bokeh.plotting import figure
import matplotlib.pyplot as plt
# Load the raw per-site term counts and discard rows that have no term.
counts_file = "counts.csv"
df_counts = pd.read_csv(counts_file).dropna(subset=['term'])

# Re-express each site's counts as a percentage of that site's column total.
for site_column in ('CNN', 'Fox'):
    site_total = df_counts[site_column].sum()
    df_counts[site_column] = (df_counts[site_column] / site_total) * 100
df_counts.head(2)

# Reshape from one column per site to long form: one row per (term, site)
# pair, with the percentage stored in 'term_pct'.
df_counts = pd.melt(
    df_counts,
    id_vars='term',
    var_name='site',
    value_name='term_pct',
)
df_counts.head(2)
# Number of highest-percentage terms to keep from each site.
TOP_NUMBER = 5

# Rank each site's rows by term percentage, largest first, and keep the
# first TOP_NUMBER of them.
top_CNN = df_counts[df_counts['site'] == 'CNN'].sort_values(by='term_pct', ascending=False)[:TOP_NUMBER]
top_Fox = df_counts[df_counts['site'] == 'Fox'].sort_values(by='term_pct', ascending=False)[:TOP_NUMBER]
top_CNN_term = top_CNN['term'].tolist()
top_Fox_terms = top_Fox['term'].tolist()

# Union of both leader lists, de-duplicated through a set (so the order of
# top_terms is arbitrary; it is only used for membership below).
top_terms = list(set(top_CNN_term + top_Fox_terms))
top_terms

# Keep every row (whatever its site) whose term is in the union.  Take an
# explicit copy so the title-casing below writes to an independent frame
# instead of a slice of df_counts, which would otherwise trigger pandas'
# SettingWithCopyWarning.
plot_data = df_counts.loc[df_counts['term'].isin(top_terms)].copy()
plot_data['term'] = plot_data['term'].str.title()
# Draw the grouped bar chart with Bokeh and write it out as an HTML page.
chart_options = dict(
    label=CatAttr(columns=['term'], sort=True),
    values='term_pct',
    group="site",
    legend="top_right",
    tools="previewsave",
    height=600,
    width=900,
    title="Top Terms for CNN and Fox",
    xlabel="Term",
    ylabel="Percentage of Terms",
)
p = Bar(plot_data, **chart_options)

# Work around the bar width issue by forcing a glyph width of 0.33 on each
# renderer; renderers for which the assignment raises AttributeError are
# left untouched.
for renderer in p.renderers:
    try:
        renderer.glyph.width = 0.33
    except AttributeError:
        continue

# Attach the data-source note underneath the plot area.
msg = "Note: Data are from CNN.com and Foxnews.com. Common and one-letter words have been excluded."
caption = Title(text=msg, align='left', text_font_size='8pt')
p.add_layout(caption, 'below')

output_file("term_pct.html")
show(p)
# We can make a similar plot using Matplotlib (ggplot is buggy), producing a PNG image

# `%matplotlib inline` is IPython-only syntax and a SyntaxError in a plain
# Python file, so request the inline backend through the IPython API instead.
try:
    get_ipython().run_line_magic('matplotlib', 'inline')
except NameError:
    # get_ipython is not defined outside IPython/Jupyter; there is no inline
    # backend to enable, and the figure is still written to disk below.
    pass

# Order rows by term, then split by site to get one series per site.
plot_data = plot_data.sort_values(by='term')
cnn_data = plot_data.loc[plot_data['site'] == 'CNN']
fox_data = plot_data.loc[plot_data['site'] == 'Fox']
cnn = cnn_data['term_pct'].tolist()
fox = fox_data['term_pct'].tolist()

# NOTE(review): the two series are paired by position and the tick labels are
# taken from the CNN rows only - this assumes both sites list the same terms
# in the same order after sorting; confirm against the input data.
ind = np.arange(len(cnn))  # left edge of each CNN/Fox pair
width = 0.35               # width of a single bar

fig, ax = plt.subplots()
# align='edge' is passed explicitly: each pair then spans
# [ind, ind + 2 * width], so the tick at ind + width sits at the middle of
# the pair.  With centre-aligned bars (the Matplotlib >= 2.0 default) that
# tick would land under the Fox bar instead.
rects1 = ax.bar(ind, cnn, width, color='r', align='edge')
rects2 = ax.bar(ind + width, fox, width, color='y', align='edge')

# add some text for labels, title and axes ticks
ax.set_title('Term Frequency by News Source', fontsize=10)
ax.set_ylabel('Percentage of Terms', fontsize=8)
ax.set_xticks(ind + width)
ax.set_xticklabels(cnn_data['term'].tolist(), fontsize=4, rotation=45)
ax.legend((rects1[0], rects2[0]), ('CNN', 'Fox'), prop={'size': 6})

fig.savefig('term_pct.png', dpi=250)