Law Firm Website Colors
Goal
To understand the distribution of colors on law firm websites. However, there are a number of ways that color distribution could be analyzed (each representation is sketched as plain Python data below), including:
- For any given site, the bucketed priority order of colors (i.e. blue is most prevalent, followed by red, followed by black).
- For any given site, a histogram of bucketed colors (i.e. the page is 50% blue, 30% red, 20% black).
- The blended histogram across all sites under test (i.e. the ecosystem is 50% blue, 30% red, 20% black).
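To make those three representations concrete, here is a minimal sketch of the shape each one might take in plain Python; the color names and numbers are purely illustrative.
# Illustrative only: hypothetical values showing the shape of each representation

# 1. Per-site priority order of bucketed colors (most to least prevalent)
site_color_ranking = ['blue', 'red', 'black']

# 2. Per-site histogram of bucketed colors (fraction of all pixels per bucket)
site_color_histogram = {'blue': 0.50, 'red': 0.30, 'black': 0.20}

# 3. Blended histogram across every site under test
ecosystem_color_histogram = {'blue': 0.50, 'red': 0.30, 'black': 0.20}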
What I think is most interesting would be to understand the answers to questions like:
- What color is the primary color for the majority of law firm websites?
- What is the ranking of primary color for law firm websites (highest to lowest popularity)?
- What colors are most commonly used together on law firm websites?
- What colors are most common for firms when grouped by their primary practice area? In other words, do personal injury firms share common colors while criminal defense firms lean towards different color combinations?
- If we look at a state (like Florida), do the common colors differ from the national average?
# get a DB set up so we don't have to perform expensive work more than once!
!pip install sqlalchemy
from sqlalchemy import create_engine, MetaData, Table, Column, Numeric, String, Boolean, select
from sqlalchemy.orm import registry, Session
from sqlalchemy import func
import csv
engine = create_engine('sqlite:///law_firm_websites.sqlite')
connection = engine.connect()
mapper_registry = registry()
metadata_obj = MetaData()
websites_table = Table(
"websites",
metadata_obj,
Column("domain", String(120), primary_key=True),
Column("firm_name", String(100), nullable=False),
Column("primary_state", String(100), nullable=True),
Column("firm_primary_practice_area", String(100), nullable=True),
# calculated columns
Column("has_had_frequencies_calculated", Boolean, default=False),
Column("has_frequency_calculation_failure", Boolean, default=False),
Column("red_frequency", Numeric(5, 4), nullable=True),
Column("orange_frequency", Numeric(5, 4), nullable=True),
Column("yellow_frequency", Numeric(5, 4), nullable=True),
Column("green_frequency", Numeric(5, 4), nullable=True),
Column("blue_frequency", Numeric(5, 4), nullable=True),
Column("purple_frequency", Numeric(5, 4), nullable=True)
)
metadata_obj.create_all(engine)
class Website:
pass
mapper_registry.map_imperatively(Website, websites_table)
# if the DB has never been loaded with data, seed it now
def seed_db(seed_filename):
with Session(engine) as session:
row_count = session.scalar(select(func.count()).select_from(Website))
if row_count > 0:
print('Website table has already been seeded, not reseeding so we dont lose work')
return
print('Website table has never been seeded, seeding it now')
seed_websites = []
seen_domains = set()
with open(seed_filename, 'r', encoding='latin-1') as csvfile:
reader = csv.DictReader(csvfile)
for row in reader:
try:
website = session.scalars(select(Website).filter_by(domain=row['Website'])).one()
except Exception as exc:
# verify the domain isn't already present in our list of websites to add
if row['Website'] not in seen_domains:
seed_websites.append(
Website(firm_name=row['Account Name'], domain=row['Website'], primary_state=row['Primary State'], firm_primary_practice_area=row['Case Types Preferred'])
)
seen_domains.add(row['Website'])
session.add_all(seed_websites)
session.commit()
# seed the database with 2025 firms
seed_db('../seeds/all-firms-2025.csv')
Requirement already satisfied: sqlalchemy in /Users/bobby/Code/MeanPug/Digital/Brand/data-vis/venv/lib/python3.10/site-packages (2.0.39) Requirement already satisfied: typing-extensions>=4.6.0 in /Users/bobby/Code/MeanPug/Digital/Brand/data-vis/venv/lib/python3.10/site-packages (from sqlalchemy) (4.12.2) Website table has already been seeded, not reseeding so we dont lose work
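Before doing any expensive scraping, it's worth a quick sanity check that the seed actually landed. A minimal sketch that reuses the engine and mapped Website class from the cell above (the five-row limit and printed columns are arbitrary):
from sqlalchemy import select, func
from sqlalchemy.orm import Session

# peek at a few seeded rows and confirm the total count (purely illustrative)
with Session(engine) as session:
    for site in session.scalars(select(Website).limit(5)).all():
        print(site.domain, site.primary_state, site.firm_primary_practice_area)
    print('total websites:', session.scalar(select(func.count()).select_from(Website)))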
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
import cv2
import numpy as np
import matplotlib.pyplot as plt
import os
def take_screenshot(url):
    # if the url doesn't have a protocol, add one
if not url.startswith('http'):
url = 'https://' + url
print(f'fetching URL {url} to take a screenshot')
output_path = os.path.join(os.getcwd(), 'screenshot.png')
# Initialize a webdriver (e.g., Chrome, Firefox). Ensure the webdriver is in your PATH.
chrome_options = Options()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--start-maximized')
driver = webdriver.Chrome(options=chrome_options)
driver.maximize_window()
# Navigate to the webpage
driver.get(url)
# verify the page has a body. If it doesn't, our approach will be slightly different (we won't screenshot the whole page)
try:
full_page = driver.find_element(By.TAG_NAME, "body")
except Exception:
full_page = None
if full_page:
width = driver.execute_script("return Math.max( document.body.scrollWidth, document.body.offsetWidth, document.documentElement.clientWidth, document.documentElement.scrollWidth, document.documentElement.offsetWidth );")
height = driver.execute_script("return Math.max( document.body.scrollHeight, document.body.offsetHeight, document.documentElement.clientHeight, document.documentElement.scrollHeight, document.documentElement.offsetHeight );")
driver.set_window_size(width, height)
full_page.screenshot(output_path)
else:
driver.save_screenshot(output_path)
# Close the browser
driver.quit()
return output_path
def create_histogram(image_path):
image = cv2.imread(image_path)
b, g, r = cv2.split(image)
hist_b = cv2.calcHist([b], [0], None, [256], [0, 256])
hist_g = cv2.calcHist([g], [0], None, [256], [0, 256])
hist_r = cv2.calcHist([r], [0], None, [256], [0, 256])
plt.figure(figsize=(10, 5))
plt.plot(hist_b, color='blue', label='Blue')
plt.plot(hist_g, color='green', label='Green')
plt.plot(hist_r, color='red', label='Red')
plt.xlabel('Pixel Value')
plt.ylabel('Frequency')
plt.title('Color Spectrograph')
plt.legend()
plt.show()
create_histogram(take_screenshot('https://www.meanpug.com'))
create_histogram(take_screenshot('https://dallimarino.com'))
This isn't exactly what we want. We're getting the frequency distribution of the 3 color channels in the image, but it's going to be hard to extract discrete color names pixel-by-pixel using this approach. Trying something else.
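To make the limitation concrete, here is a small sketch using two tiny synthetic images (not site screenshots): the images share no pixel colors at all, yet their per-channel histograms come out identical, so channel histograms alone can't tell us which named colors a page uses.
import numpy as np
import cv2

# image A: checkerboard of pure red and pure blue pixels (OpenCV uses BGR order)
a = np.zeros((2, 2, 3), dtype=np.uint8)
a[0, 0] = a[1, 1] = (0, 0, 255)    # red
a[0, 1] = a[1, 0] = (255, 0, 0)    # blue

# image B: checkerboard of magenta and black pixels
b = np.zeros((2, 2, 3), dtype=np.uint8)
b[0, 0] = b[1, 1] = (255, 0, 255)  # magenta

# the per-channel histograms are identical even though no pixel color is shared
for channel in range(3):
    hist_a = cv2.calcHist([a], [channel], None, [256], [0, 256])
    hist_b = cv2.calcHist([b], [channel], None, [256], [0, 256])
    print(channel, np.array_equal(hist_a, hist_b))  # True for all three channels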
from PIL import Image
# low-high ranges of RGB arrays for canonical color names
# (left unfilled: tight per-color RGB ranges proved impractical, and this approach is superseded by the HSV map further down)
canonical_color_ranges = {
'red': '',
'green': '',
'blue': '',
'yellow': '',
'purple': '',
'orange': '',
'black': '',
'white': '',
'brown': '',
}
def get_pixel_frequency_for_color(image_path, color='red'):
    """
    Calculates the frequency of a single canonical color in an image.
    NOTE: this prototype ignores the `color` argument and only checks a hard-coded
    orange range; it exists to prove out the pixel-masking approach.
    Args:
        image_path (str): The path to the image file.
        color (str): The canonical color name to measure (not yet wired up).
    Returns:
        float: The frequency of the color in the image (between 0 and 1).
        Returns None if the image cannot be opened.
    """
try:
img = Image.open(image_path)
img = img.convert("RGB") # Ensure the image is in RGB format
pixels = np.array(img).reshape(-1, 3)
# Define orange color range (in RGB)
lower_orange = np.array([204, 96, 2])
upper_orange = np.array([255, 198, 77])
# Create a mask for orange pixels
orange_mask = np.all((pixels >= lower_orange) & (pixels <= upper_orange), axis=1)
# Count orange pixels
orange_pixels_count = np.sum(orange_mask)
# Calculate total pixels
total_pixels = pixels.shape[0]
# Calculate frequency
orange_frequency = orange_pixels_count / total_pixels if total_pixels > 0 else 0.0
return orange_frequency
except FileNotFoundError:
print(f"Error: Image file not found at {image_path}")
return None
except Exception as e:
print(f"An error occurred: {e}")
return None
print(get_pixel_frequency_for_color(os.path.join(os.getcwd(), 'sample_screenshot.png'), 'orange'))
0.1481675502232143
This seems to be on the right path. However, mapping low-high ranges from colors to RGB will be difficult, and likely very inaccurate. Instead, using HSV format seems like the better option. Let's see what we can do.
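For intuition on why HSV helps: in HSV the hue channel alone mostly determines the named color, so each bucket becomes a narrow hue interval instead of an awkward 3-D RGB box. A quick sketch converting a few RGB values (the two orange bounds from the prototype above, plus an arbitrary blue) into OpenCV's HSV representation, where hue runs 0-179:
import numpy as np
import cv2

# dark orange, light orange (the RGB bounds used above), and an arbitrary blue
samples_rgb = np.array([[[204, 96, 2], [255, 198, 77], [20, 40, 160]]], dtype=np.uint8)
samples_hsv = cv2.cvtColor(samples_rgb, cv2.COLOR_RGB2HSV)
print(samples_hsv[0])
# both oranges land in a narrow hue band (roughly 14-20 on OpenCV's 0-179 scale),
# while the blue sits near hue ~115, even though the raw RGB triples look nothing alike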
# low-high HSV values for canonical colors
canonical_color_hsv_map = {
'red': (
(
np.array([0, 100, 100]),
np.array([9, 255, 255]),
),
(
np.array([156, 100, 100]),
np.array([180, 255, 255]),
)
),
'orange': (
np.array([10, 128, 128]),
np.array([25, 255, 255])
),
'yellow': (
np.array([26, 100, 100]),
np.array([36, 255, 255])
),
'green': (
np.array([37, 50, 50]),
np.array([85, 255, 255])
),
'blue': (
np.array([86, 50, 50]),
np.array([130, 255, 255])
),
'purple': (
np.array([131, 128, 128]),
np.array([155, 255, 255])
)
}
def get_pixel_frequency_for_colors(image_path):
""" rather than get the pixel frequency for a single color, instead get a dict mapping color names to their frequency in the image """
# Load the image in BGR format
color_frequency_map = {}
image_bgr = cv2.imread(image_path)
# Convert the BGR image to HSV
image_hsv = cv2.cvtColor(image_bgr, cv2.COLOR_BGR2HSV)
# Now 'image_hsv' contains the image data in the HSV color space
# You can access individual channels (H, S, V) if needed
hue_channel, saturation_channel, value_channel = cv2.split(image_hsv)
# Calculate total pixels
total_pixels = hue_channel.size
print(f'calculated {total_pixels} total pixels in the image')
for canonical_color, hsv_range in canonical_color_hsv_map.items():
        # red is the special case: its hue wraps around the ends of the hue circle, so it is stored as two (low, high) ranges that get OR'ed together
if type(hsv_range[0]) is tuple:
red_lower_conditions, red_upper_conditions = hsv_range
            pixel_match_condition = (
                (
                    ((hue_channel >= red_lower_conditions[0][0]) & (hue_channel <= red_lower_conditions[1][0])) &
                    ((saturation_channel >= red_lower_conditions[0][1]) & (saturation_channel <= red_lower_conditions[1][1])) &
                    ((value_channel >= red_lower_conditions[0][2]) & (value_channel <= red_lower_conditions[1][2]))
                )
                |
                (
                    ((hue_channel >= red_upper_conditions[0][0]) & (hue_channel <= red_upper_conditions[1][0])) &
                    ((saturation_channel >= red_upper_conditions[0][1]) & (saturation_channel <= red_upper_conditions[1][1])) &
                    ((value_channel >= red_upper_conditions[0][2]) & (value_channel <= red_upper_conditions[1][2]))
                )
            )
else:
pixel_match_condition = (
((hue_channel >= hsv_range[0][0]) & (hue_channel <= hsv_range[1][0])) &
((saturation_channel >= hsv_range[0][1]) & (saturation_channel <= hsv_range[1][1])) &
((value_channel >= hsv_range[0][2]) & (value_channel <= hsv_range[1][2]))
)
pixel_color_match = hue_channel[pixel_match_condition]
# finally, get ratio of pixel color match against all pixels
color_frequency_map[canonical_color] = pixel_color_match.size / total_pixels
# Removing whites and blacks for more interesting results
# color_frequency_map['black'] = value_channel[value_channel < 50].size / total_pixels
# color_frequency_map['white'] = value_channel[(value_channel >= 200) & (saturation_channel < 50)].size / total_pixels
return color_frequency_map
print(get_pixel_frequency_for_colors(os.path.join(os.getcwd(), 'sample_screenshot.png')))
calculated 4300800 total pixels in the image {'red': 0.0, 'orange': 0.14875627790178572, 'yellow': 0.0, 'green': 0.0, 'blue': 0.0, 'purple': 0.0, 'black': 0.020785667782738094, 'white': 0.785459216889881}
Now that is looking more like the source image. Let's plot it as a histogram and compare it to our source image:
Source Image:
Distribution
def plot_color_distribution(color_frequency_map):
colors = color_frequency_map.keys()
values = color_frequency_map.values()
# replace white since you can't see it
chart_colors = [c if c != 'white' else 'darkgrey' for c in colors]
plt.bar(colors, values, color=chart_colors)
plt.title('Image Color Distribution')
plt.xlabel('Colors')
plt.ylabel('Frequency')
plt.show()
plot_color_distribution(get_pixel_frequency_for_colors(os.path.join(os.getcwd(), 'sample_screenshot.png')))
calculated 4300800 total pixels in the image
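One useful sanity check on those HSV ranges is to push a single known color through the same bucketing logic and confirm it lands where expected. A small sketch using a 1x1 image and the canonical_color_hsv_map defined above (the helper name and test colors are mine):
import numpy as np
import cv2

def bucket_for_rgb(rgb):
    """ return the canonical color bucket a single RGB value falls into, or None """
    pixel = np.uint8([[rgb]])  # a 1x1 "image" in RGB order
    h, s, v = cv2.cvtColor(pixel, cv2.COLOR_RGB2HSV)[0][0]
    for name, hsv_range in canonical_color_hsv_map.items():
        ranges = hsv_range if type(hsv_range[0]) is tuple else (hsv_range,)
        for low, high in ranges:
            if low[0] <= h <= high[0] and low[1] <= s <= high[1] and low[2] <= v <= high[2]:
                return name
    return None  # whites, blacks, and greys fall outside every bucket

print(bucket_for_rgb((204, 96, 2)))   # expected: orange
print(bucket_for_rgb((30, 60, 200)))  # expected: blue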
Now that we have color distribution frequency being calculated for a single site, it's time to start thinking about the next steps:
- Running this process over our larger dataset of firms
- Creating comparators to answer our primary questions
from sqlalchemy import select
from sqlalchemy.orm import Session
from collections import defaultdict
# start by making sure we have a way to create intermediary checkpoints for our work, otherwise I predict a lot of heartache :)
def checkpoint(domain, color_frequency_map):
# create session and add objects
with Session(engine) as session:
try:
website = session.scalars(select(Website).filter_by(domain=domain)).one()
except Exception as exc:
print(f'FAILED TO CHECKPOINT {domain}, it isnt in the DB')
raise exc
website.has_had_frequencies_calculated = True
website.red_frequency = color_frequency_map['red']
website.orange_frequency = color_frequency_map['orange']
website.yellow_frequency = color_frequency_map['yellow']
website.green_frequency = color_frequency_map['green']
website.blue_frequency = color_frequency_map['blue']
website.purple_frequency = color_frequency_map['purple']
session.commit()
def get_pixel_frequency_for_website(firm: Website):
""" as part of this, create a function to use the memoized pixel frequencies for a firm (website object) or - if not yet set - calculate the frequencies,
checkpoint, and return them """
if firm.has_had_frequencies_calculated:
return {
'red': firm.red_frequency,
'orange': firm.orange_frequency,
'yellow': firm.yellow_frequency,
'green': firm.green_frequency,
'blue': firm.blue_frequency,
'purple': firm.purple_frequency
}
else:
try:
output_screenshot = take_screenshot(firm.domain)
color_frequency_map = get_pixel_frequency_for_colors(output_screenshot)
except Exception as exc:
with Session(engine) as session:
local_firm = session.merge(firm)
local_firm.has_frequency_calculation_failure = True
session.add(local_firm)
session.commit()
raise exc
checkpoint(firm.domain, color_frequency_map)
return color_frequency_map
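With checkpointing in place, progress on a long run can be monitored straight from the database rather than from the console output. A small sketch built only on the columns defined earlier (the report format is arbitrary):
from sqlalchemy import select, func
from sqlalchemy.orm import Session

# illustrative progress report built from the checkpoint flags
with Session(engine) as session:
    total = session.scalar(select(func.count()).select_from(websites_table))
    done = session.scalar(
        select(func.count()).select_from(websites_table).where(websites_table.c.has_had_frequencies_calculated == True)
    )
    failed = session.scalar(
        select(func.count()).select_from(websites_table).where(websites_table.c.has_frequency_calculation_failure == True)
    )
    print(f'{done}/{total} websites processed so far, {failed} marked as failed')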
# (1 + 2) - What is the most popular color and what is the ranking of primary color for law firm websites (highest to lowest popularity)?
# In reality, what we probably want to see here is two different things:
# 1. A histogram of the single most prevalent color for every website
# 2. The relative popularity of each color across all sites. In other words, summing up the frequency of each color across all sites and showing that distribution
def graph_most_prevalent_color(color_frequency_maps, ax=None):
color_winner_counter = defaultdict(int)
for color_frequency_map in color_frequency_maps:
descending_frequency_tuples = sorted(color_frequency_map.items(), key=lambda item: item[1], reverse=True)
color_winner_counter[descending_frequency_tuples[0][0]] += 1
total_entry_count = len(color_frequency_maps)
custom_labels = ["({:.1%})".format(val / total_entry_count) for val in color_winner_counter.values()]
chart_colors = [c if c != 'white' else 'darkgrey' for c in color_winner_counter.keys()]
    # Create the bar chart. If an axis is supplied, plot there instead of building a new figure
    standalone = ax is None
    if standalone:
        fig, ax = plt.subplots()
# for the bar labels, show the percentage of the whole for each value
bars = ax.bar(color_winner_counter.keys(), color_winner_counter.values(), color=chart_colors)
ax.bar_label(bars, labels=custom_labels, fontsize=8)
# Set chart title and labels
ax.set_xlabel('Color')
ax.set_ylabel('Popularity')
    # only title and render the figure when this function created it itself
    if standalone:
        ax.set_title('Distribution of Most Popular Color for Law Firm Websites')
        # Show the plot
        plt.show()
graph_most_prevalent_color([
{'red': .3, 'blue': .4, 'green': .3, 'yellow': 0, 'orange': 0, 'black': 0, 'white': 0, 'purple': 0},
{'red': .1, 'blue': .42, 'green': .1, 'yellow': 0, 'orange': .18, 'black': 0, 'white': 0, 'purple': 0},
{'red': .4, 'blue': .1, 'green': .3, 'yellow': 0, 'orange': 0, 'black': .2, 'white': 0, 'purple': 0},
])
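Only the first of the two views described above (the per-site winner histogram) is implemented here; as a companion, this is a sketch of what the second view, summing each color's frequency across all sites, might look like. The function name and labels are mine:
def graph_overall_color_popularity(color_frequency_maps, ax=None):
    """ sum each color's frequency across all sites and plot the blended distribution
    (sketch of the second view described above, not part of the original analysis) """
    totals = defaultdict(float)
    for color_frequency_map in color_frequency_maps:
        for color, frequency in color_frequency_map.items():
            totals[color] += float(frequency)
    grand_total = sum(totals.values()) or 1.0
    shares = {color: value / grand_total for color, value in totals.items()}
    chart_colors = [c if c != 'white' else 'darkgrey' for c in shares]
    standalone = ax is None
    if standalone:
        fig, ax = plt.subplots()
    ax.bar(shares.keys(), shares.values(), color=chart_colors)
    ax.set_xlabel('Color')
    ax.set_ylabel('Share of all bucketed pixels')
    if standalone:
        ax.set_title('Blended Color Distribution Across All Sites')
        plt.show()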
# (3) - What colors are most commonly used together on law firm websites?
def graph_most_common_color_combinations(color_frequency_maps, ax=None):
""" there are two ways you _could_ think about this problem:
1. Find the pairs of colors that appear on-site together most frequently, regardless of their frequency to the whole
2. Find the colors that appear as the top two for frequency most often
    I think number two makes more sense conceptually, so I'm going to use that heuristic.
We'll do this as a heatmap with colors across both axes and the count of combinations as the heat
"""
# our final data representation needs to be an nxn matrix (where n = the number of colors binned). Each cell in the 2d array represents
# the count of instances where the intersection of the 2 colors are the most frequent on a website
colors = color_frequency_maps[0].keys()
heatmap_data = np.zeros([len(colors), len(colors)])
for i, color1 in enumerate(colors):
for j, color2 in enumerate(colors):
if color1 == color2:
heatmap_data[i][j] = -1
else:
# get the count of instances from the frequency maps where color1 and color2 are the two most frequent
combo_counter = 0
for color_frequency_map in color_frequency_maps:
descending_frequency_tuples = sorted(color_frequency_map.items(), key=lambda item: item[1], reverse=True)
most_frequent_colors = [_[0] for _ in descending_frequency_tuples[:2]]
if color1 in most_frequent_colors and color2 in most_frequent_colors:
combo_counter += 1
heatmap_data[i][j] = combo_counter
    standalone = ax is None
    if standalone:
        fig, ax = plt.subplots()
heatmap = ax.imshow(heatmap_data, cmap='viridis')
ax.set_xticks(np.arange(len(colors)))
ax.set_yticks(np.arange(len(colors)))
ax.set_xticklabels(colors)
ax.set_yticklabels(colors)
plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor")
for i in range(len(colors)):
for j in range(len(colors)):
text = ax.text(j, i, f"{heatmap_data[i, j]:.2f}" if heatmap_data[i, j] >= 0 else "Inf", ha="center", va="center", color="w")
    # only title, add the colorbar, and render when this function created the figure itself
    if standalone:
        ax.set_title("Primary Color Combinations Heatmap")
        fig.colorbar(heatmap)
        fig.tight_layout()
        plt.show()
graph_most_common_color_combinations([
{'red': .3, 'blue': .4, 'green': .29, 'yellow': 0, 'orange': 0, 'black': 0, 'white': 0, 'purple': 0},
{'red': .19, 'blue': .42, 'green': .1, 'yellow': 0, 'orange': .18, 'black': 0, 'white': 0, 'purple': 0},
{'red': .4, 'blue': .1, 'green': .3, 'yellow': 0, 'orange': 0, 'black': .2, 'white': 0, 'purple': 0},
])
# (4) - What colors are most common for firms as grouped by their primary practice area?
def graph_popular_colors_by_practice_area(firms, color_frequency_maps):
""" for each practice area, subplot the heatmap of color combinations and most popular color distribution """
    # start by grouping color frequency maps by practice area
    practice_area_color_maps = defaultdict(list)
    for firm, color_frequency_map in zip(firms, color_frequency_maps):
        practice_area_color_maps[firm.firm_primary_practice_area].append(color_frequency_map)
    # one row of subplots per practice area; squeeze=False keeps axs 2-D even with a single group
    fig, axs = plt.subplots(len(practice_area_color_maps), 2, figsize=(15, len(practice_area_color_maps) * 5), squeeze=False)
fig.suptitle('Popular Colors By Practice Area')
for i, (practice_area, color_frequency_maps) in enumerate(practice_area_color_maps.items()):
axs[i][0].set_title(f'{practice_area} Popular Colors')
axs[i][1].set_title(f'{practice_area} Popular Combinations')
graph_most_prevalent_color(color_frequency_maps, ax=axs[i][0])
graph_most_common_color_combinations(color_frequency_maps, ax=axs[i][1])
plt.tight_layout()
plt.show()
# test
with Session(engine) as session:
firms = session.scalars(select(Website).limit(4)).all()
color_frequency_maps = [get_pixel_frequency_for_website(f) for f in firms]
graph_popular_colors_by_practice_area(firms, color_frequency_maps)
# (5) - If we look at a state (like Florida), do the common colors differ from the national average?
def graph_popular_colors_by_primary_state(firms, color_frequency_maps):
""" for each state, subplot the heatmap of color combinations and most popular color distribution """
    # start by grouping color frequency maps by primary state
    primary_state_color_maps = defaultdict(list)
    for firm, color_frequency_map in zip(firms, color_frequency_maps):
        primary_state_color_maps[firm.primary_state].append(color_frequency_map)
    # one row of subplots per state; squeeze=False keeps axs 2-D even with a single group
    fig, axs = plt.subplots(len(primary_state_color_maps), 2, figsize=(15, len(primary_state_color_maps) * 5), squeeze=False)
fig.suptitle('Popular Colors By Primary State')
for i, (primary_state, color_frequency_maps) in enumerate(primary_state_color_maps.items()):
axs[i][0].set_title(f'{primary_state} Popular Colors')
axs[i][1].set_title(f'{primary_state} Popular Combinations')
graph_most_prevalent_color(color_frequency_maps, ax=axs[i][0])
graph_most_common_color_combinations(color_frequency_maps, ax=axs[i][1])
plt.tight_layout()
plt.show()
# test
with Session(engine) as session:
firms = session.scalars(select(Website).limit(4)).all()
color_frequency_maps = [get_pixel_frequency_for_website(f) for f in firms]
graph_popular_colors_by_primary_state(firms, color_frequency_maps)
Let's run a test over a sample of 100 sites and see what type of results we get!
with Session(engine) as session:
firms = session.scalars(select(Website).limit(100)).all()
color_frequency_maps = []
for f in firms:
if f.has_frequency_calculation_failure:
continue
try:
color_frequency_maps.append(get_pixel_frequency_for_website(f))
except Exception as e:
print(f'got exception calculating pixel frequency for firm {f.domain} ({e})')
continue
graph_most_prevalent_color(color_frequency_maps)
graph_most_common_color_combinations(color_frequency_maps)
graph_popular_colors_by_practice_area(firms, color_frequency_maps)
graph_popular_colors_by_primary_state(firms, color_frequency_maps)
fetching URL https://mcmanuslawfirm.com/ to take a screenshot calculated 7682688 total pixels in the image fetching URL https://kttlaw.com/ to take a screenshot calculated 8767872 total pixels in the image fetching URL https://www.mcnallywi.com/ to take a screenshot calculated 2835648 total pixels in the image fetching URL https://www.davidamahlesq.com/ to take a screenshot calculated 14373504 total pixels in the image fetching URL https://www.thecowtownlawyer.com to take a screenshot calculated 10948608 total pixels in the image fetching URL https://www.baerlawoffice.com/ to take a screenshot calculated 10577088 total pixels in the image fetching URL https://www.meshbesherlawfirm.com/ to take a screenshot calculated 1384128 total pixels in the image fetching URL https://www.leattys.com/ to take a screenshot calculated 5825088 total pixels in the image fetching URL https://beyourvoice.com/ to take a screenshot calculated 2802816 total pixels in the image fetching URL https://www.rockylawfirm.com/ to take a screenshot calculated 2013120 total pixels in the image fetching URL https://desmondlawfirm.com to take a screenshot calculated 2502330 total pixels in the image fetching URL https://www.nunneleyfamilylaw.com to take a screenshot got exception calculating pixel frequency for firm www.nunneleyfamilylaw.com (Message: unknown error: net::ERR_SSL_PROTOCOL_ERROR (Session info: chrome=134.0.6998.119) Stacktrace: 0 chromedriver 0x00000001010076c8 cxxbridge1$str$ptr + 2791212 1 chromedriver 0x0000000100fffc9c cxxbridge1$str$ptr + 2759936 2 chromedriver 0x0000000100b51e30 cxxbridge1$string$len + 92928 3 chromedriver 0x0000000100b4a3c4 cxxbridge1$string$len + 61588 4 chromedriver 0x0000000100b3c5ac cxxbridge1$string$len + 4732 5 chromedriver 0x0000000100b3df8c cxxbridge1$string$len + 11356 6 chromedriver 0x0000000100b3ca14 cxxbridge1$string$len + 5860 7 chromedriver 0x0000000100b3c388 cxxbridge1$string$len + 4184 8 chromedriver 0x0000000100b3c0d4 cxxbridge1$string$len + 3492 9 chromedriver 0x0000000100b39dec chromedriver + 187884 10 chromedriver 0x0000000100b3a910 chromedriver + 190736 11 chromedriver 0x0000000100b54de0 cxxbridge1$string$len + 105136 12 chromedriver 0x0000000100bdb118 cxxbridge1$string$len + 654824 13 chromedriver 0x0000000100bda5f8 cxxbridge1$string$len + 651976 14 chromedriver 0x0000000100b8d2fc cxxbridge1$string$len + 335820 15 chromedriver 0x0000000100fcc6c4 cxxbridge1$str$ptr + 2549544 16 chromedriver 0x0000000100fcf988 cxxbridge1$str$ptr + 2562540 17 chromedriver 0x0000000100fac71c cxxbridge1$str$ptr + 2418560 18 chromedriver 0x0000000100fd01e8 cxxbridge1$str$ptr + 2564684 19 chromedriver 0x0000000100f9d750 cxxbridge1$str$ptr + 2357172 20 chromedriver 0x0000000100feff58 cxxbridge1$str$ptr + 2695100 21 chromedriver 0x0000000100ff00e0 cxxbridge1$str$ptr + 2695492 22 chromedriver 0x0000000100fff910 cxxbridge1$str$ptr + 2759028 23 libsystem_pthread.dylib 0x0000000186d91034 _pthread_start + 136 24 libsystem_pthread.dylib 0x0000000186d8be3c thread_start + 8 ) fetching URL https://www.jcdelaw.com/ to take a screenshot calculated 5142528 total pixels in the image fetching URL https://www.gmglawfirm.com/ to take a screenshot calculated 9825408 total pixels in the image fetching URL https://madialaw.com/ to take a screenshot calculated 8239104 total pixels in the image fetching URL https://gainsburghbenjamin.com/ to take a screenshot got exception calculating pixel frequency for firm https://gainsburghbenjamin.com/ (Message: unknown error: unhandled inspector error: 
{"code":-32000,"message":"Cannot take screenshot with 0 height."} (Session info: chrome=134.0.6998.119) Stacktrace: 0 chromedriver 0x0000000100cd36c8 cxxbridge1$str$ptr + 2791212 1 chromedriver 0x0000000100ccbc9c cxxbridge1$str$ptr + 2759936 2 chromedriver 0x000000010081de30 cxxbridge1$string$len + 92928 3 chromedriver 0x00000001008077a4 cxxbridge1$string$len + 1140 4 chromedriver 0x0000000100805dc4 chromedriver + 187844 5 chromedriver 0x0000000100806a44 chromedriver + 191044 6 chromedriver 0x000000010082acc8 cxxbridge1$string$len + 145816 7 chromedriver 0x0000000100862470 cxxbridge1$string$len + 373056 8 chromedriver 0x000000010085a818 cxxbridge1$string$len + 341224 9 chromedriver 0x00000001008a65f8 cxxbridge1$string$len + 651976 10 chromedriver 0x00000001008592fc cxxbridge1$string$len + 335820 11 chromedriver 0x0000000100c986c4 cxxbridge1$str$ptr + 2549544 12 chromedriver 0x0000000100c9b988 cxxbridge1$str$ptr + 2562540 13 chromedriver 0x0000000100c7871c cxxbridge1$str$ptr + 2418560 14 chromedriver 0x0000000100c9c1e8 cxxbridge1$str$ptr + 2564684 15 chromedriver 0x0000000100c69750 cxxbridge1$str$ptr + 2357172 16 chromedriver 0x0000000100cbbf58 cxxbridge1$str$ptr + 2695100 17 chromedriver 0x0000000100cbc0e0 cxxbridge1$str$ptr + 2695492 18 chromedriver 0x0000000100ccb910 cxxbridge1$str$ptr + 2759028 19 libsystem_pthread.dylib 0x0000000186d91034 _pthread_start + 136 20 libsystem_pthread.dylib 0x0000000186d8be3c thread_start + 8 ) fetching URL https://www.gnclaw.com/ to take a screenshot calculated 1209600 total pixels in the image fetching URL https://www.meyers-flowers.com/ to take a screenshot calculated 21326976 total pixels in the image fetching URL https://www.malmanlaw.com/ to take a screenshot calculated 9704448 total pixels in the image fetching URL https://www.daveabels.com/ to take a screenshot got exception calculating pixel frequency for firm https://www.daveabels.com/ (Message: unknown error: unhandled inspector error: {"code":-32000,"message":"Unable to capture screenshot"} (Session info: chrome=134.0.6998.119) Stacktrace: 0 chromedriver 0x00000001005e36c8 cxxbridge1$str$ptr + 2791212 1 chromedriver 0x00000001005dbc9c cxxbridge1$str$ptr + 2759936 2 chromedriver 0x000000010012de30 cxxbridge1$string$len + 92928 3 chromedriver 0x00000001001177a4 cxxbridge1$string$len + 1140 4 chromedriver 0x0000000100115dc4 chromedriver + 187844 5 chromedriver 0x0000000100116a44 chromedriver + 191044 6 chromedriver 0x000000010013acc8 cxxbridge1$string$len + 145816 7 chromedriver 0x0000000100172470 cxxbridge1$string$len + 373056 8 chromedriver 0x000000010016a818 cxxbridge1$string$len + 341224 9 chromedriver 0x00000001001b65f8 cxxbridge1$string$len + 651976 10 chromedriver 0x00000001001692fc cxxbridge1$string$len + 335820 11 chromedriver 0x00000001005a86c4 cxxbridge1$str$ptr + 2549544 12 chromedriver 0x00000001005ab988 cxxbridge1$str$ptr + 2562540 13 chromedriver 0x000000010058871c cxxbridge1$str$ptr + 2418560 14 chromedriver 0x00000001005ac1e8 cxxbridge1$str$ptr + 2564684 15 chromedriver 0x0000000100579750 cxxbridge1$str$ptr + 2357172 16 chromedriver 0x00000001005cbf58 cxxbridge1$str$ptr + 2695100 17 chromedriver 0x00000001005cc0e0 cxxbridge1$str$ptr + 2695492 18 chromedriver 0x00000001005db910 cxxbridge1$str$ptr + 2759028 19 libsystem_pthread.dylib 0x0000000186d91034 _pthread_start + 136 20 libsystem_pthread.dylib 0x0000000186d8be3c thread_start + 8 ) fetching URL https://www.thehigginsfirm.com/ to take a screenshot got exception calculating pixel frequency for firm https://www.thehigginsfirm.com/ 
(Message: unknown error: unhandled inspector error: {"code":-32000,"message":"Unable to capture screenshot"} (Session info: chrome=134.0.6998.119) Stacktrace: 0 chromedriver 0x00000001027f36c8 cxxbridge1$str$ptr + 2791212 1 chromedriver 0x00000001027ebc9c cxxbridge1$str$ptr + 2759936 2 chromedriver 0x000000010233de30 cxxbridge1$string$len + 92928 3 chromedriver 0x00000001023277a4 cxxbridge1$string$len + 1140 4 chromedriver 0x0000000102325dc4 chromedriver + 187844 5 chromedriver 0x0000000102326a44 chromedriver + 191044 6 chromedriver 0x000000010234acc8 cxxbridge1$string$len + 145816 7 chromedriver 0x0000000102382470 cxxbridge1$string$len + 373056 8 chromedriver 0x000000010237a818 cxxbridge1$string$len + 341224 9 chromedriver 0x00000001023c65f8 cxxbridge1$string$len + 651976 10 chromedriver 0x00000001023792fc cxxbridge1$string$len + 335820 11 chromedriver 0x00000001027b86c4 cxxbridge1$str$ptr + 2549544 12 chromedriver 0x00000001027bb988 cxxbridge1$str$ptr + 2562540 13 chromedriver 0x000000010279871c cxxbridge1$str$ptr + 2418560 14 chromedriver 0x00000001027bc1e8 cxxbridge1$str$ptr + 2564684 15 chromedriver 0x0000000102789750 cxxbridge1$str$ptr + 2357172 16 chromedriver 0x00000001027dbf58 cxxbridge1$str$ptr + 2695100 17 chromedriver 0x00000001027dc0e0 cxxbridge1$str$ptr + 2695492 18 chromedriver 0x00000001027eb910 cxxbridge1$str$ptr + 2759028 19 libsystem_pthread.dylib 0x0000000186d91034 _pthread_start + 136 20 libsystem_pthread.dylib 0x0000000186d8be3c thread_start + 8 ) fetching URL https://www.alpertfellowslaw.com/ to take a screenshot calculated 1384128 total pixels in the image fetching URL https://www.darrenedlaw.com/ to take a screenshot calculated 14259456 total pixels in the image fetching URL https://www.skillernfirm.com to take a screenshot calculated 14277172 total pixels in the image fetching URL https://www.ohalloransimmons.com to take a screenshot calculated 746432 total pixels in the image fetching URL https://socklaw.com to take a screenshot calculated 4574016 total pixels in the image fetching URL https://www.treyyateslaw.com to take a screenshot calculated 10094976 total pixels in the image fetching URL https://www.susanbrownlaw.com to take a screenshot calculated 6927552 total pixels in the image fetching URL https://swartzlaw.com/ to take a screenshot calculated 12382848 total pixels in the image fetching URL https://fletcherandphillips.com to take a screenshot calculated 5901120 total pixels in the image fetching URL http://www.janmulligan.com to take a screenshot calculated 6336576 total pixels in the image fetching URL https://www.martindale.com/attorney/robert-j-pecora-112802/ to take a screenshot calculated 1524096 total pixels in the image fetching URL http://www.lindadankmanlaw.com to take a screenshot calculated 25680 total pixels in the image fetching URL http://www.peachweathers.com to take a screenshot calculated 6486912 total pixels in the image fetching URL http://www.ledgerlaw.com to take a screenshot calculated 15135552 total pixels in the image fetching URL http://www.braytonlaw.com to take a screenshot calculated 13944420 total pixels in the image fetching URL https://www.caemployeelawyer.com/ to take a screenshot calculated 32832 total pixels in the image fetching URL https://www.salamatilaw.com to take a screenshot calculated 17155584 total pixels in the image fetching URL http://www.dfis-law.com to take a screenshot calculated 7105536 total pixels in the image fetching URL http://www.rosensaba.com to take a screenshot calculated 30123756 total pixels 
in the image fetching URL http://www.nieldlaw.com to take a screenshot calculated 357808 total pixels in the image fetching URL http://www.meissnerlaw.com to take a screenshot calculated 1676160 total pixels in the image fetching URL http://ellisinjurylaw.com to take a screenshot calculated 15619392 total pixels in the image fetching URL http://www.efglawyer.com to take a screenshot calculated 9906624 total pixels in the image fetching URL http://www.czechandhowell.com to take a screenshot calculated 2890944 total pixels in the image fetching URL https://www.larsonlegalservices.com/ to take a screenshot calculated 9789120 total pixels in the image fetching URL http://www.rwolaw.com to take a screenshot calculated 4981824 total pixels in the image fetching URL https://www.metierlaw.com to take a screenshot calculated 12569472 total pixels in the image fetching URL http://www.mcelyealawoffice.com to take a screenshot calculated 1384128 total pixels in the image fetching URL https://www.hsdlawfirm.com to take a screenshot calculated 16391808 total pixels in the image
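Several of the failures above are transient chromedriver/SSL errors rather than permanently broken sites, so it can be worth clearing the failure flags and retrying those domains on a later pass. A minimal sketch (retrying everything is an assumption; some failures may be genuinely unrecoverable):
from sqlalchemy import select
from sqlalchemy.orm import Session

# clear the failure flag so previously failed domains are picked up on the next run
with Session(engine) as session:
    failed_sites = session.scalars(
        select(Website).filter_by(has_frequency_calculation_failure=True)
    ).all()
    print(f'resetting {len(failed_sites)} failed websites for retry')
    for site in failed_sites:
        site.has_frequency_calculation_failure = False
    session.commit()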
domains = ['https://www.knoxbotelerlaw.com', 'https://www.knrlegal.com', 'https://www.kollislaw.com', 'https://www.konellruggiero.com', 'https://www.konicekdillonlaw.com', 'https://www.kplegal.com/', 'https://www.krhlaw.com/', 'https://www.krwlawyers.com', 'https://www.kvpclaw.com', 'https://www.kylawpractice.com', 'https://www.kylerobbinslaw.com']
color_frequency_maps = []
for domain in domains:
take_screenshot(domain)
color_frequency_maps.append(get_pixel_frequency_for_colors(os.path.join(os.getcwd(), 'screenshot.png')))
graph_most_prevalent_color(color_frequency_maps)
graph_most_common_color_combinations(color_frequency_maps)
calculated 25680 total pixels in the image calculated 24223104 total pixels in the image calculated 1389312 total pixels in the image calculated 6931008 total pixels in the image calculated 1384128 total pixels in the image calculated 25680 total pixels in the image calculated 350960 total pixels in the image calculated 9052992 total pixels in the image calculated 25680 total pixels in the image calculated 14746752 total pixels in the image calculated 6497280 total pixels in the image
Small-scale testing is looking good! Let's run this for real :)
with Session(engine) as session:
# 10,000 firms
firms = session.scalars(select(Website).limit(10000)).all()
color_frequency_maps = []
for f in firms:
if f.has_frequency_calculation_failure:
continue
try:
color_frequency_maps.append(get_pixel_frequency_for_website(f))
except Exception as e:
print(f'got exception calculating pixel frequency for firm {f.domain} ({e})')
continue
graph_most_prevalent_color(color_frequency_maps)
graph_most_common_color_combinations(color_frequency_maps)
graph_popular_colors_by_practice_area(firms, color_frequency_maps)
graph_popular_colors_by_primary_state(firms, color_frequency_maps)
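Because every successful run is checkpointed, the headline questions can also be revisited directly from the database without regenerating any screenshots. A sketch of one way to do that (averaging each frequency column across processed sites; the aggregation choice is mine):
from sqlalchemy import select, func
from sqlalchemy.orm import Session

# average frequency of each color bucket across all processed websites (illustrative)
color_columns = {
    'red': websites_table.c.red_frequency,
    'orange': websites_table.c.orange_frequency,
    'yellow': websites_table.c.yellow_frequency,
    'green': websites_table.c.green_frequency,
    'blue': websites_table.c.blue_frequency,
    'purple': websites_table.c.purple_frequency,
}
with Session(engine) as session:
    averages = {}
    for name, column in color_columns.items():
        averages[name] = session.scalar(
            select(func.avg(column)).where(websites_table.c.has_had_frequencies_calculated == True)
        )
    for name, avg in sorted(averages.items(), key=lambda kv: kv[1] or 0, reverse=True):
        print(f'{name}: {float(avg or 0):.4f}')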