Law Firm Website Colors¶

Goal¶

To understand the distribution of colors on law firm websites. However, there are a number of ways that color distribution could be analyzed, including:

  • For any given site, the bucketed priority order of colors (i.e. blue is most prevalent, followed by red, followed by black).
  • For any given site, a histogram of bucketed colors (i.e. page is 50% blue, 30% red, 20% black)
  • The blended histogram across all sites under test (i.e. the ecosystem is 50% blue, 30% red, 20% black).

What I think is most interesting would be to understand the answers to questions like:

  1. What color is the primary color for the majority of law firm websites?
  2. What is the ranking of primary color for law firm websites (highest to lowest popularity)?
  3. What colors are most commonly used together on law firm websites?
  4. What colors are most common for firms as grouped by their primary practice area? In other words, do personal injury firms share common colors while criminal defense firms lean towards different color combinations?
  5. If we look at a state (like Florida), do the common colors differ from the national average?
In [202]:
# get a DB set up so we don't have to perform expensive work more than once!
!pip install sqlalchemy

from sqlalchemy import create_engine, MetaData, Table, Column, Numeric, String, Boolean, select
from sqlalchemy.orm import registry, Session
from sqlalchemy import func
import csv

# local SQLite file so expensive screenshot/color work survives notebook restarts
engine = create_engine('sqlite:///law_firm_websites.sqlite')
connection = engine.connect()

# registry/metadata pair used for imperative (classical) mapping below
mapper_registry = registry()
metadata_obj = MetaData()

# one row per law firm website; the frequency columns are filled in lazily,
# one site at a time, as screenshots get processed
websites_table = Table(
    "websites",
    metadata_obj,
    Column("domain", String(120), primary_key=True),
    Column("firm_name", String(100), nullable=False),
    Column("primary_state", String(100), nullable=True),
    Column("firm_primary_practice_area", String(100), nullable=True),
    # calculated columns
    Column("has_had_frequencies_calculated", Boolean, default=False),
    Column("has_frequency_calculation_failure", Boolean, default=False),
    # Numeric(5, 4): frequencies are 0..1 ratios kept to 4 decimal places
    Column("red_frequency", Numeric(5, 4), nullable=True),
    Column("orange_frequency", Numeric(5, 4), nullable=True),
    Column("yellow_frequency", Numeric(5, 4), nullable=True),
    Column("green_frequency", Numeric(5, 4), nullable=True),
    Column("blue_frequency", Numeric(5, 4), nullable=True),
    Column("purple_frequency", Numeric(5, 4), nullable=True)
)

# create the schema if it doesn't exist yet (no-op on an already-built DB)
metadata_obj.create_all(engine)

# empty placeholder class; its mapped attributes come from the imperative
# mapping to websites_table immediately below
class Website:
    pass


mapper_registry.map_imperatively(Website, websites_table)

# if the DB has never been loaded with data, seed it now
def seed_db(seed_filename):
    """Seed the websites table from a CSV export of firms.

    Idempotent: if the table already holds rows, nothing is done so that
    previously-checkpointed frequency columns are not lost. Duplicate domains
    (already in the DB, or repeated within the CSV) are inserted only once.

    Args:
        seed_filename (str): path to the CSV; expected columns are
            'Account Name', 'Website', 'Primary State', 'Case Types Preferred'.
    """
    with Session(engine) as session:
        row_count = session.scalar(select(func.count()).select_from(Website))
        if row_count > 0:
            print('Website table has already been seeded, not reseeding so we dont lose work')
            return

        print('Website table has never been seeded, seeding it now')

        seed_websites = []
        seen_domains = set()
        with open(seed_filename, 'r', encoding='latin-1') as csvfile:
            reader = csv.DictReader(csvfile)
            for row in reader:
                domain = row['Website']
                # the original detected "not yet in DB" by catching a broad
                # Exception from .one(), which also swallowed KeyErrors and
                # real DB errors; one_or_none() expresses the intent directly
                existing = session.scalars(
                    select(Website).filter_by(domain=domain)
                ).one_or_none()
                if existing is None and domain not in seen_domains:
                    seed_websites.append(
                        Website(
                            firm_name=row['Account Name'],
                            domain=domain,
                            primary_state=row['Primary State'],
                            firm_primary_practice_area=row['Case Types Preferred'],
                        )
                    )
                    seen_domains.add(domain)

        session.add_all(seed_websites)
        session.commit()

# seed the database with 2025 firms
seed_db('../seeds/all-firms-2025.csv')
Requirement already satisfied: sqlalchemy in /Users/bobby/Code/MeanPug/Digital/Brand/data-vis/venv/lib/python3.10/site-packages (2.0.39)
Requirement already satisfied: typing-extensions>=4.6.0 in /Users/bobby/Code/MeanPug/Digital/Brand/data-vis/venv/lib/python3.10/site-packages (from sqlalchemy) (4.12.2)
Website table has already been seeded, not reseeding so we dont lose work
In [193]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
import cv2
import numpy as np
import matplotlib.pyplot as plt
import os


def take_screenshot(url):
    """Capture a full-page screenshot of *url* with headless Chrome.

    Args:
        url (str): site to screenshot; a https:// scheme is prepended when
            no protocol is present.

    Returns:
        str: absolute path to the saved screenshot.png in the CWD.

    Raises:
        selenium.common.exceptions.WebDriverException: on navigation or
            capture failures (callers record these as firm-level failures).
    """
    # check if the url doesn't have a protocol, add one if not.
    if not url.startswith('http'):
        url = 'https://' + url

    print(f'fetching URL {url} to take a screenshot')

    output_path = os.path.join(os.getcwd(), 'screenshot.png')

    # Initialize a webdriver (e.g., Chrome, Firefox). Ensure the webdriver is in your PATH.
    chrome_options = Options()
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--start-maximized')
    driver = webdriver.Chrome(options=chrome_options)
    # BUG FIX: the original leaked a Chrome process whenever driver.get() or
    # the screenshot raised; the finally block guarantees cleanup
    try:
        driver.maximize_window()

        # Navigate to the webpage
        driver.get(url)

        # verify the page has a body. If it doesn't, our approach will be slightly different (we won't screenshot the whole page)
        try:
            full_page = driver.find_element(By.TAG_NAME, "body")
        except Exception:
            full_page = None

        if full_page:
            # resize the window to the full document extent so the body
            # element screenshot covers the whole page, not just the viewport
            width = driver.execute_script("return Math.max( document.body.scrollWidth, document.body.offsetWidth, document.documentElement.clientWidth, document.documentElement.scrollWidth, document.documentElement.offsetWidth );")
            height = driver.execute_script("return Math.max( document.body.scrollHeight, document.body.offsetHeight, document.documentElement.clientHeight, document.documentElement.scrollHeight, document.documentElement.offsetHeight );")
            driver.set_window_size(width, height)

            full_page.screenshot(output_path)
        else:
            driver.save_screenshot(output_path)
    finally:
        # Close the browser even on failure
        driver.quit()

    return output_path


def create_histogram(image_path):
    """Plot per-channel (B, G, R) pixel-value histograms for an image.

    Args:
        image_path (str): path to the image file to analyze.

    Raises:
        FileNotFoundError: if the image cannot be read.
    """
    image = cv2.imread(image_path)
    # cv2.imread silently returns None on a missing/unreadable file, which
    # previously surfaced as a cryptic error inside cv2.split
    if image is None:
        raise FileNotFoundError(f'could not read image at {image_path}')

    b, g, r = cv2.split(image)
    hist_b = cv2.calcHist([b], [0], None, [256], [0, 256])
    hist_g = cv2.calcHist([g], [0], None, [256], [0, 256])
    hist_r = cv2.calcHist([r], [0], None, [256], [0, 256])

    plt.figure(figsize=(10, 5))
    plt.plot(hist_b, color='blue', label='Blue')
    plt.plot(hist_g, color='green', label='Green')
    plt.plot(hist_r, color='red', label='Red')
    plt.xlabel('Pixel Value')
    plt.ylabel('Frequency')
    plt.title('Color Spectrograph')
    plt.legend()
    plt.show()
In [144]:
# sanity check: RGB channel histogram for a site we know well
create_histogram(take_screenshot('https://www.meanpug.com'))
No description has been provided for this image
In [140]:
# second sanity check against another live firm site
create_histogram(take_screenshot('https://dallimarino.com'))
No description has been provided for this image

This isn't exactly what we want. We're getting the frequency distribution of the 3 color channels in the image, but it's going to be hard to extract discrete color names pixel-by-pixel using this approach. Trying something else.

In [41]:
from PIL import Image

# low-high ranges of RGB arrays for canonical color names
# NOTE(review): these placeholder values were never filled in — the RGB-range
# approach was abandoned in favor of the HSV map defined in a later cell, and
# this dict is not referenced anywhere below
canonical_color_ranges = {
    'red': '',
    'green': '',
    'blue': '',
    'yellow': '',
    'purple': '',
    'orange': '',
    'black': '',
    'white': '',
    'brown': '',
}


def get_pixel_frequency_for_color(image_path, color='red'):
    """
    Calculates the frequency of pixels falling in a color's RGB range.

    NOTE: only an 'orange' range has been tuned so far. Any other value of
    *color* (including the default 'red') falls back to the orange range,
    which preserves the original behavior where the parameter was ignored
    entirely. TODO: add ranges for the remaining canonical colors.

    Args:
        image_path (str): The path to the image file.
        color (str): canonical color name to measure.

    Returns:
        float: The frequency of the color in the image (between 0 and 1).
               Returns None if the image cannot be opened.
    """
    # inclusive (lower, upper) RGB bounds per canonical color
    rgb_ranges = {
        'orange': (np.array([204, 96, 2]), np.array([255, 198, 77])),
    }
    lower_bound, upper_bound = rgb_ranges.get(color, rgb_ranges['orange'])

    try:
        img = Image.open(image_path)
        img = img.convert("RGB")  # Ensure the image is in RGB format
        pixels = np.array(img).reshape(-1, 3)

        # a pixel matches only when all three channels are inside the bounds
        color_mask = np.all((pixels >= lower_bound) & (pixels <= upper_bound), axis=1)

        matching_pixel_count = np.sum(color_mask)
        total_pixels = pixels.shape[0]

        # guard against a degenerate zero-pixel image
        return matching_pixel_count / total_pixels if total_pixels > 0 else 0.0
    except FileNotFoundError:
         print(f"Error: Image file not found at {image_path}")
         return None
    except Exception as e:
        print(f"An error occurred: {e}")
        return None
In [125]:
# spot-check the orange frequency against a saved sample screenshot
print(get_pixel_frequency_for_color(os.path.join(os.getcwd(), 'sample_screenshot.png'), 'orange'))
0.1481675502232143

This seems to be on the right path. However, mapping low-high ranges from colors to RGB will be difficult, and likely very inaccurate. Instead, using HSV format seems like the better option. Let's see what we can do.

In [227]:
# low-high HSV values for canonical colors.
# OpenCV uses hue in [0, 180], so red wraps around the hue axis and is
# expressed as a tuple of two (low, high) sub-ranges; every other color is a
# single (low, high) pair of [H, S, V] arrays
canonical_color_hsv_map = {
    'red': (
        (
            np.array([0, 100, 100]),
            np.array([9, 255, 255]),
        ),
        (
            np.array([156, 100, 100]),
            np.array([180, 255, 255]),
        )
    ),
    'orange': (
        np.array([10, 128, 128]),
        np.array([25, 255, 255])
    ),
    'yellow': (
        np.array([26, 100, 100]),
        np.array([36, 255, 255])
    ),
    'green': (
        np.array([37, 50, 50]),
        np.array([85, 255, 255])
    ),
    'blue': (
        np.array([86, 50, 50]),
        np.array([130, 255, 255])
    ),
    'purple': (
        np.array([131, 128, 128]),
        np.array([155, 255, 255])
    )
}

def get_pixel_frequency_for_colors(image_path):
    """Map every canonical color name to its pixel frequency in the image.

    Pixels are classified in HSV space against the inclusive ranges in
    canonical_color_hsv_map. A color expressed as a tuple of two sub-ranges
    (red, whose hue wraps around 0/180) matches when either sub-range does.

    Args:
        image_path (str): path to the screenshot to analyze.

    Returns:
        dict: color name -> fraction of pixels (0..1) inside that range.

    Raises:
        FileNotFoundError: if the image cannot be read.
    """
    color_frequency_map = {}

    image_bgr = cv2.imread(image_path)
    # cv2.imread returns None rather than raising on an unreadable file
    if image_bgr is None:
        raise FileNotFoundError(f'could not read image at {image_path}')

    # Convert the BGR image to HSV and split into per-channel arrays
    image_hsv = cv2.cvtColor(image_bgr, cv2.COLOR_BGR2HSV)
    hue_channel, saturation_channel, value_channel = cv2.split(image_hsv)

    # Calculate total pixels
    total_pixels = hue_channel.size
    print(f'calculated {total_pixels} total pixels in the image')

    def in_range(low, high):
        # boolean mask of pixels whose H, S and V all fall inside [low, high]
        return (
            ((hue_channel >= low[0]) & (hue_channel <= high[0])) &
            ((saturation_channel >= low[1]) & (saturation_channel <= high[1])) &
            ((value_channel >= low[2]) & (value_channel <= high[2]))
        )

    for canonical_color, hsv_range in canonical_color_hsv_map.items():
        if type(hsv_range[0]) is tuple:
            # a pair of sub-ranges (currently only red, due to hue wrap-around)
            (low_a, high_a), (low_b, high_b) = hsv_range
            pixel_match_condition = in_range(low_a, high_a) | in_range(low_b, high_b)
        else:
            pixel_match_condition = in_range(hsv_range[0], hsv_range[1])

        # finally, get ratio of pixel color match against all pixels
        color_frequency_map[canonical_color] = (
            np.count_nonzero(pixel_match_condition) / total_pixels
        )

    # Removing whites and blacks for more interesting results
    # color_frequency_map['black'] = value_channel[value_channel < 50].size / total_pixels
    # color_frequency_map['white'] = value_channel[(value_channel >= 200) & (saturation_channel < 50)].size / total_pixels

    return color_frequency_map
In [77]:
# spot-check the full color-frequency map against the same sample screenshot
print(get_pixel_frequency_for_colors(os.path.join(os.getcwd(), 'sample_screenshot.png')))
calculated 4300800 total pixels in the image
{'red': 0.0, 'orange': 0.14875627790178572, 'yellow': 0.0, 'green': 0.0, 'blue': 0.0, 'purple': 0.0, 'black': 0.020785667782738094, 'white': 0.785459216889881}

Now that is looking more like the source image. Let's plot it as a histogram and compare it to our source image:

Source Image: Brown Immigration Screenshot

Distribution

In [82]:
def plot_color_distribution(color_frequency_map):
    """Render one site's color name -> frequency mapping as a bar chart."""
    names = list(color_frequency_map.keys())
    frequencies = list(color_frequency_map.values())

    # white bars would be invisible on a white background, so draw them grey
    bar_colors = ['darkgrey' if name == 'white' else name for name in names]

    plt.bar(names, frequencies, color=bar_colors)
    plt.title('Image Color Distribution')
    plt.xlabel('Colors')
    plt.ylabel('Frequency')

    plt.show()
In [83]:
# render the color distribution for the sample screenshot
plot_color_distribution(get_pixel_frequency_for_colors(os.path.join(os.getcwd(), 'sample_screenshot.png')))
calculated 4300800 total pixels in the image
No description has been provided for this image

Now that we have color distribution frequency being calculated for a single site, it's time to start thinking about the next steps:

  1. Running this process over our larger dataset of firms
  2. Creating comparators to answer our primary questions
In [213]:
from sqlalchemy import select
from sqlalchemy.orm import Session
from collections import defaultdict

# start by making sure we have a way to create intermediary checkpoints for our work, otherwise I predict a lot of heartache :)
def checkpoint(domain, color_frequency_map):
    """Persist the computed color frequencies onto the websites row for *domain*.

    Marks the row as calculated so future runs reuse the stored values.
    Raises if no row exists for the domain.
    """
    with Session(engine) as session:
        try:
            website = session.scalars(select(Website).filter_by(domain=domain)).one()
        except Exception as exc:
            print(f'FAILED TO CHECKPOINT {domain}, it isnt in the DB')
            raise exc

        website.has_had_frequencies_calculated = True
        # write each color's frequency onto its matching *_frequency column
        for color_name in ('red', 'orange', 'yellow', 'green', 'blue', 'purple'):
            setattr(website, f'{color_name}_frequency', color_frequency_map[color_name])

        session.commit()


def get_pixel_frequency_for_website(firm: Website):
    """Return the color-frequency map for *firm*, using memoized DB values
    when available; otherwise screenshot the site, compute the frequencies,
    checkpoint them, and return the fresh map. On failure, flags the row so
    future runs can skip the domain, then re-raises.
    """
    color_names = ('red', 'orange', 'yellow', 'green', 'blue', 'purple')

    # cache hit: frequencies were already persisted on this row
    if firm.has_had_frequencies_calculated:
        return {name: getattr(firm, f'{name}_frequency') for name in color_names}

    # cache miss: screenshot the site and measure its colors
    try:
        screenshot_path = take_screenshot(firm.domain)
        color_frequency_map = get_pixel_frequency_for_colors(screenshot_path)
    except Exception as exc:
        # record the failure so this domain can be skipped on later passes
        with Session(engine) as session:
            local_firm = session.merge(firm)
            local_firm.has_frequency_calculation_failure = True
            session.add(local_firm)
            session.commit()

        raise exc

    checkpoint(firm.domain, color_frequency_map)
    return color_frequency_map
        
In [169]:
# (1 + 2) - What is the most popular color and what is the ranking of primary color for law firm websites (highest to lowest popularity)?
# In reality, what we probably want to see here is two different things:
# 1. A histogram of the single most prevalent color for every website
# 2. The relative popularity of each color across all sites. In other words, summing up the frequency of each color across all sites and showing that distribution
def graph_most_prevalent_color(color_frequency_maps, ax=None):
    """Bar-chart how often each color is a site's single most prevalent color.

    Args:
        color_frequency_maps: list of dicts mapping color name -> frequency.
        ax: optional matplotlib axis to draw on; when omitted a new figure is
            created, titled, and shown.
    """
    color_winner_counter = defaultdict(int)

    for color_frequency_map in color_frequency_maps:
        # a site's "winner" is its highest-frequency color
        winner, _ = max(color_frequency_map.items(), key=lambda item: item[1])
        color_winner_counter[winner] += 1

    total_entry_count = len(color_frequency_maps)
    # bar labels show each winner's share of all sites
    custom_labels = ["({:.1%})".format(val / total_entry_count) for val in color_winner_counter.values()]
    chart_colors = [c if c != 'white' else 'darkgrey' for c in color_winner_counter.keys()]

    # BUG FIX: the original re-tested `if not ax` *after* assigning ax, so the
    # standalone title/show branch could never run; capture standalone-ness first
    standalone = ax is None
    if standalone:
        fig, ax = plt.subplots()

    bars = ax.bar(color_winner_counter.keys(), color_winner_counter.values(), color=chart_colors)
    ax.bar_label(bars, labels=custom_labels, fontsize=8)

    # Set chart title and labels
    ax.set_xlabel('Color')
    ax.set_ylabel('Popularity')

    if standalone:
        ax.set_title('Distribution of Most Popular Color for Law Firm Websites')

        # Show the plot
        plt.show()


graph_most_prevalent_color([
    {'red': .3, 'blue': .4, 'green': .3, 'yellow': 0, 'orange': 0, 'black': 0, 'white': 0, 'purple': 0},
    {'red': .1, 'blue': .42, 'green': .1, 'yellow': 0, 'orange': .18, 'black': 0, 'white': 0, 'purple': 0},
    {'red': .4, 'blue': .1, 'green': .3, 'yellow': 0, 'orange': 0, 'black': .2, 'white': 0, 'purple': 0},
])
No description has been provided for this image
In [171]:
# (3) - What colors are most commonly used together on law firm websites?
def graph_most_common_color_combinations(color_frequency_maps, ax=None):
    """ there are two ways you _could_ think about this problem:
    1. Find the pairs of colors that appear on-site together most frequently, regardless of their frequency to the whole
    2. Find the colors that appear as the top two for frequency most often

    I think number two makes more sense conceptually, so going to use that heuristic.

    We'll do this as a heatmap with colors across both axes and the count of combinations as the heat
    """
    colors = list(color_frequency_maps[0].keys())

    # precompute, once per site, the set of its two highest-frequency colors
    # (the original re-sorted every map inside the O(colors^2) loop below)
    top_two_sets = []
    for color_frequency_map in color_frequency_maps:
        ranked = sorted(color_frequency_map.items(), key=lambda item: item[1], reverse=True)
        top_two_sets.append({name for name, _ in ranked[:2]})

    # nxn matrix (n = number of binned colors); each cell counts the sites
    # where that pair of colors is the two most frequent
    heatmap_data = np.zeros([len(colors), len(colors)])
    for i, color1 in enumerate(colors):
        for j, color2 in enumerate(colors):
            if color1 == color2:
                # sentinel for the diagonal; rendered as "Inf" below
                heatmap_data[i][j] = -1
            else:
                heatmap_data[i][j] = sum(
                    1 for top_two in top_two_sets
                    if color1 in top_two and color2 in top_two
                )

    # BUG FIX: the original re-tested `if not ax` after assigning ax, so the
    # standalone title/colorbar/show branch was dead code
    standalone = ax is None
    if standalone:
        fig, ax = plt.subplots()

    heatmap = ax.imshow(heatmap_data, cmap='viridis')

    ax.set_xticks(np.arange(len(colors)))
    ax.set_yticks(np.arange(len(colors)))

    ax.set_xticklabels(colors)
    ax.set_yticklabels(colors)

    plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor")

    # annotate every cell with its count (diagonal shows "Inf")
    for i in range(len(colors)):
        for j in range(len(colors)):
            ax.text(j, i, f"{heatmap_data[i, j]:.2f}" if heatmap_data[i, j] >= 0 else "Inf", ha="center", va="center", color="w")

    if standalone:
        ax.set_title("Primary Color Combinations Heatmap")

        fig.colorbar(heatmap)
        fig.tight_layout()

        plt.show()

graph_most_common_color_combinations([
    {'red': .3, 'blue': .4, 'green': .29, 'yellow': 0, 'orange': 0, 'black': 0, 'white': 0, 'purple': 0},
    {'red': .19, 'blue': .42, 'green': .1, 'yellow': 0, 'orange': .18, 'black': 0, 'white': 0, 'purple': 0},
    {'red': .4, 'blue': .1, 'green': .3, 'yellow': 0, 'orange': 0, 'black': .2, 'white': 0, 'purple': 0},
])
No description has been provided for this image
In [225]:
# (4) - What colors are most common for firms as grouped by their primary practice area?
def graph_popular_colors_by_practice_area(firms, color_frequency_maps):
    """For each practice area, subplot the most-popular-color distribution and
    the color-combination heatmap.

    Args:
        firms: Website rows; must align 1:1 with color_frequency_maps.
        color_frequency_maps: per-firm dicts of color name -> frequency.
    """
    # start by grouping color frequency maps by each firm's practice area
    practice_area_color_maps = defaultdict(list)
    for firm, color_frequency_map in zip(firms, color_frequency_maps):
        practice_area_color_maps[firm.firm_primary_practice_area].append(color_frequency_map)

    # BUG FIX: squeeze=False keeps axs 2-D even when there is a single
    # practice area; the original crashed on axs[i][0] in that case
    fig, axs = plt.subplots(len(practice_area_color_maps), 2, figsize=(15, len(practice_area_color_maps) * 5), squeeze=False)
    fig.suptitle('Popular Colors By Practice Area')

    # avoid shadowing the color_frequency_maps parameter inside the loop
    for i, (practice_area, area_maps) in enumerate(practice_area_color_maps.items()):
        axs[i][0].set_title(f'{practice_area} Popular Colors')
        axs[i][1].set_title(f'{practice_area} Popular Combinations')
        graph_most_prevalent_color(area_maps, ax=axs[i][0])
        graph_most_common_color_combinations(area_maps, ax=axs[i][1])

    plt.tight_layout()
    plt.show()


# test
with Session(engine) as session:
    firms = session.scalars(select(Website).limit(4)).all()
    color_frequency_maps = [get_pixel_frequency_for_website(f) for f in firms]
    graph_popular_colors_by_practice_area(firms, color_frequency_maps)
No description has been provided for this image
In [222]:
# (5) - If we look at a state (like Florida), do the common colors differ from the national average?
def graph_popular_colors_by_primary_state(firms, color_frequency_maps):
    """For each state, subplot the most-popular-color distribution and the
    color-combination heatmap.

    Args:
        firms: Website rows; must align 1:1 with color_frequency_maps.
        color_frequency_maps: per-firm dicts of color name -> frequency.
    """
    # start by grouping color frequency maps by each firm's primary state
    primary_state_color_maps = defaultdict(list)
    for firm, color_frequency_map in zip(firms, color_frequency_maps):
        primary_state_color_maps[firm.primary_state].append(color_frequency_map)

    # BUG FIX: squeeze=False keeps axs 2-D even when there is a single state;
    # the original crashed on axs[i][0] in that case
    fig, axs = plt.subplots(len(primary_state_color_maps), 2, figsize=(15, len(primary_state_color_maps) * 5), squeeze=False)
    fig.suptitle('Popular Colors By Primary State')

    # avoid shadowing the color_frequency_maps parameter inside the loop
    for i, (primary_state, state_maps) in enumerate(primary_state_color_maps.items()):
        axs[i][0].set_title(f'{primary_state} Popular Colors')
        axs[i][1].set_title(f'{primary_state} Popular Combinations')
        graph_most_prevalent_color(state_maps, ax=axs[i][0])
        graph_most_common_color_combinations(state_maps, ax=axs[i][1])

    plt.tight_layout()
    plt.show()


# test
with Session(engine) as session:
    firms = session.scalars(select(Website).limit(4)).all()
    color_frequency_maps = [get_pixel_frequency_for_website(f) for f in firms]
    graph_popular_colors_by_primary_state(firms, color_frequency_maps)
No description has been provided for this image

Let's run a test over a sample of 100 sites and see what type of results we get!

In [226]:
# run the full pipeline over a 100-site sample and render every comparison
with Session(engine) as session:
    firms = session.scalars(select(Website).limit(100)).all()
    successful_firms = []
    color_frequency_maps = []
    for f in firms:
        if f.has_frequency_calculation_failure:
            continue

        try:
            color_frequency_maps.append(get_pixel_frequency_for_website(f))
        except Exception as e:
            print(f'got exception calculating pixel frequency for firm {f.domain} ({e})')
            continue

        # only track the firm once its frequency map was actually computed
        successful_firms.append(f)

    graph_most_prevalent_color(color_frequency_maps)
    graph_most_common_color_combinations(color_frequency_maps)
    # BUG FIX: the original passed the unfiltered `firms` list alongside the
    # filtered maps, so the zip inside the grouping functions paired firms
    # with other firms' color data whenever any site was skipped or failed
    graph_popular_colors_by_practice_area(successful_firms, color_frequency_maps)
    graph_popular_colors_by_primary_state(successful_firms, color_frequency_maps)
fetching URL https://mcmanuslawfirm.com/ to take a screenshot
calculated 7682688 total pixels in the image
fetching URL https://kttlaw.com/ to take a screenshot
calculated 8767872 total pixels in the image
fetching URL https://www.mcnallywi.com/ to take a screenshot
calculated 2835648 total pixels in the image
fetching URL https://www.davidamahlesq.com/ to take a screenshot
calculated 14373504 total pixels in the image
fetching URL https://www.thecowtownlawyer.com to take a screenshot
calculated 10948608 total pixels in the image
fetching URL https://www.baerlawoffice.com/ to take a screenshot
calculated 10577088 total pixels in the image
fetching URL https://www.meshbesherlawfirm.com/ to take a screenshot
calculated 1384128 total pixels in the image
fetching URL https://www.leattys.com/ to take a screenshot
calculated 5825088 total pixels in the image
fetching URL https://beyourvoice.com/ to take a screenshot
calculated 2802816 total pixels in the image
fetching URL https://www.rockylawfirm.com/ to take a screenshot
calculated 2013120 total pixels in the image
fetching URL https://desmondlawfirm.com to take a screenshot
calculated 2502330 total pixels in the image
fetching URL https://www.nunneleyfamilylaw.com to take a screenshot
got exception calculating pixel frequency for firm www.nunneleyfamilylaw.com (Message: unknown error: net::ERR_SSL_PROTOCOL_ERROR
  (Session info: chrome=134.0.6998.119)
Stacktrace:
0   chromedriver                        0x00000001010076c8 cxxbridge1$str$ptr + 2791212
1   chromedriver                        0x0000000100fffc9c cxxbridge1$str$ptr + 2759936
2   chromedriver                        0x0000000100b51e30 cxxbridge1$string$len + 92928
3   chromedriver                        0x0000000100b4a3c4 cxxbridge1$string$len + 61588
4   chromedriver                        0x0000000100b3c5ac cxxbridge1$string$len + 4732
5   chromedriver                        0x0000000100b3df8c cxxbridge1$string$len + 11356
6   chromedriver                        0x0000000100b3ca14 cxxbridge1$string$len + 5860
7   chromedriver                        0x0000000100b3c388 cxxbridge1$string$len + 4184
8   chromedriver                        0x0000000100b3c0d4 cxxbridge1$string$len + 3492
9   chromedriver                        0x0000000100b39dec chromedriver + 187884
10  chromedriver                        0x0000000100b3a910 chromedriver + 190736
11  chromedriver                        0x0000000100b54de0 cxxbridge1$string$len + 105136
12  chromedriver                        0x0000000100bdb118 cxxbridge1$string$len + 654824
13  chromedriver                        0x0000000100bda5f8 cxxbridge1$string$len + 651976
14  chromedriver                        0x0000000100b8d2fc cxxbridge1$string$len + 335820
15  chromedriver                        0x0000000100fcc6c4 cxxbridge1$str$ptr + 2549544
16  chromedriver                        0x0000000100fcf988 cxxbridge1$str$ptr + 2562540
17  chromedriver                        0x0000000100fac71c cxxbridge1$str$ptr + 2418560
18  chromedriver                        0x0000000100fd01e8 cxxbridge1$str$ptr + 2564684
19  chromedriver                        0x0000000100f9d750 cxxbridge1$str$ptr + 2357172
20  chromedriver                        0x0000000100feff58 cxxbridge1$str$ptr + 2695100
21  chromedriver                        0x0000000100ff00e0 cxxbridge1$str$ptr + 2695492
22  chromedriver                        0x0000000100fff910 cxxbridge1$str$ptr + 2759028
23  libsystem_pthread.dylib             0x0000000186d91034 _pthread_start + 136
24  libsystem_pthread.dylib             0x0000000186d8be3c thread_start + 8
)
fetching URL https://www.jcdelaw.com/ to take a screenshot
calculated 5142528 total pixels in the image
fetching URL https://www.gmglawfirm.com/ to take a screenshot
calculated 9825408 total pixels in the image
fetching URL https://madialaw.com/ to take a screenshot
calculated 8239104 total pixels in the image
fetching URL https://gainsburghbenjamin.com/ to take a screenshot
got exception calculating pixel frequency for firm https://gainsburghbenjamin.com/ (Message: unknown error: unhandled inspector error: {"code":-32000,"message":"Cannot take screenshot with 0 height."}
  (Session info: chrome=134.0.6998.119)
Stacktrace:
0   chromedriver                        0x0000000100cd36c8 cxxbridge1$str$ptr + 2791212
1   chromedriver                        0x0000000100ccbc9c cxxbridge1$str$ptr + 2759936
2   chromedriver                        0x000000010081de30 cxxbridge1$string$len + 92928
3   chromedriver                        0x00000001008077a4 cxxbridge1$string$len + 1140
4   chromedriver                        0x0000000100805dc4 chromedriver + 187844
5   chromedriver                        0x0000000100806a44 chromedriver + 191044
6   chromedriver                        0x000000010082acc8 cxxbridge1$string$len + 145816
7   chromedriver                        0x0000000100862470 cxxbridge1$string$len + 373056
8   chromedriver                        0x000000010085a818 cxxbridge1$string$len + 341224
9   chromedriver                        0x00000001008a65f8 cxxbridge1$string$len + 651976
10  chromedriver                        0x00000001008592fc cxxbridge1$string$len + 335820
11  chromedriver                        0x0000000100c986c4 cxxbridge1$str$ptr + 2549544
12  chromedriver                        0x0000000100c9b988 cxxbridge1$str$ptr + 2562540
13  chromedriver                        0x0000000100c7871c cxxbridge1$str$ptr + 2418560
14  chromedriver                        0x0000000100c9c1e8 cxxbridge1$str$ptr + 2564684
15  chromedriver                        0x0000000100c69750 cxxbridge1$str$ptr + 2357172
16  chromedriver                        0x0000000100cbbf58 cxxbridge1$str$ptr + 2695100
17  chromedriver                        0x0000000100cbc0e0 cxxbridge1$str$ptr + 2695492
18  chromedriver                        0x0000000100ccb910 cxxbridge1$str$ptr + 2759028
19  libsystem_pthread.dylib             0x0000000186d91034 _pthread_start + 136
20  libsystem_pthread.dylib             0x0000000186d8be3c thread_start + 8
)
fetching URL https://www.gnclaw.com/ to take a screenshot
calculated 1209600 total pixels in the image
fetching URL https://www.meyers-flowers.com/ to take a screenshot
calculated 21326976 total pixels in the image
fetching URL https://www.malmanlaw.com/ to take a screenshot
calculated 9704448 total pixels in the image
fetching URL https://www.daveabels.com/ to take a screenshot
got exception calculating pixel frequency for firm https://www.daveabels.com/ (Message: unknown error: unhandled inspector error: {"code":-32000,"message":"Unable to capture screenshot"}
  (Session info: chrome=134.0.6998.119)
Stacktrace:
0   chromedriver                        0x00000001005e36c8 cxxbridge1$str$ptr + 2791212
1   chromedriver                        0x00000001005dbc9c cxxbridge1$str$ptr + 2759936
2   chromedriver                        0x000000010012de30 cxxbridge1$string$len + 92928
3   chromedriver                        0x00000001001177a4 cxxbridge1$string$len + 1140
4   chromedriver                        0x0000000100115dc4 chromedriver + 187844
5   chromedriver                        0x0000000100116a44 chromedriver + 191044
6   chromedriver                        0x000000010013acc8 cxxbridge1$string$len + 145816
7   chromedriver                        0x0000000100172470 cxxbridge1$string$len + 373056
8   chromedriver                        0x000000010016a818 cxxbridge1$string$len + 341224
9   chromedriver                        0x00000001001b65f8 cxxbridge1$string$len + 651976
10  chromedriver                        0x00000001001692fc cxxbridge1$string$len + 335820
11  chromedriver                        0x00000001005a86c4 cxxbridge1$str$ptr + 2549544
12  chromedriver                        0x00000001005ab988 cxxbridge1$str$ptr + 2562540
13  chromedriver                        0x000000010058871c cxxbridge1$str$ptr + 2418560
14  chromedriver                        0x00000001005ac1e8 cxxbridge1$str$ptr + 2564684
15  chromedriver                        0x0000000100579750 cxxbridge1$str$ptr + 2357172
16  chromedriver                        0x00000001005cbf58 cxxbridge1$str$ptr + 2695100
17  chromedriver                        0x00000001005cc0e0 cxxbridge1$str$ptr + 2695492
18  chromedriver                        0x00000001005db910 cxxbridge1$str$ptr + 2759028
19  libsystem_pthread.dylib             0x0000000186d91034 _pthread_start + 136
20  libsystem_pthread.dylib             0x0000000186d8be3c thread_start + 8
)
fetching URL https://www.thehigginsfirm.com/ to take a screenshot
got exception calculating pixel frequency for firm https://www.thehigginsfirm.com/ (Message: unknown error: unhandled inspector error: {"code":-32000,"message":"Unable to capture screenshot"}
  (Session info: chrome=134.0.6998.119)
Stacktrace:
0   chromedriver                        0x00000001027f36c8 cxxbridge1$str$ptr + 2791212
1   chromedriver                        0x00000001027ebc9c cxxbridge1$str$ptr + 2759936
2   chromedriver                        0x000000010233de30 cxxbridge1$string$len + 92928
3   chromedriver                        0x00000001023277a4 cxxbridge1$string$len + 1140
4   chromedriver                        0x0000000102325dc4 chromedriver + 187844
5   chromedriver                        0x0000000102326a44 chromedriver + 191044
6   chromedriver                        0x000000010234acc8 cxxbridge1$string$len + 145816
7   chromedriver                        0x0000000102382470 cxxbridge1$string$len + 373056
8   chromedriver                        0x000000010237a818 cxxbridge1$string$len + 341224
9   chromedriver                        0x00000001023c65f8 cxxbridge1$string$len + 651976
10  chromedriver                        0x00000001023792fc cxxbridge1$string$len + 335820
11  chromedriver                        0x00000001027b86c4 cxxbridge1$str$ptr + 2549544
12  chromedriver                        0x00000001027bb988 cxxbridge1$str$ptr + 2562540
13  chromedriver                        0x000000010279871c cxxbridge1$str$ptr + 2418560
14  chromedriver                        0x00000001027bc1e8 cxxbridge1$str$ptr + 2564684
15  chromedriver                        0x0000000102789750 cxxbridge1$str$ptr + 2357172
16  chromedriver                        0x00000001027dbf58 cxxbridge1$str$ptr + 2695100
17  chromedriver                        0x00000001027dc0e0 cxxbridge1$str$ptr + 2695492
18  chromedriver                        0x00000001027eb910 cxxbridge1$str$ptr + 2759028
19  libsystem_pthread.dylib             0x0000000186d91034 _pthread_start + 136
20  libsystem_pthread.dylib             0x0000000186d8be3c thread_start + 8
)
fetching URL https://www.alpertfellowslaw.com/ to take a screenshot
calculated 1384128 total pixels in the image
fetching URL https://www.darrenedlaw.com/ to take a screenshot
calculated 14259456 total pixels in the image
fetching URL https://www.skillernfirm.com to take a screenshot
calculated 14277172 total pixels in the image
fetching URL https://www.ohalloransimmons.com to take a screenshot
calculated 746432 total pixels in the image
fetching URL https://socklaw.com to take a screenshot
calculated 4574016 total pixels in the image
fetching URL https://www.treyyateslaw.com to take a screenshot
calculated 10094976 total pixels in the image
fetching URL https://www.susanbrownlaw.com to take a screenshot
calculated 6927552 total pixels in the image
fetching URL https://swartzlaw.com/ to take a screenshot
calculated 12382848 total pixels in the image
fetching URL https://fletcherandphillips.com to take a screenshot
calculated 5901120 total pixels in the image
fetching URL http://www.janmulligan.com to take a screenshot
calculated 6336576 total pixels in the image
fetching URL https://www.martindale.com/attorney/robert-j-pecora-112802/ to take a screenshot
calculated 1524096 total pixels in the image
fetching URL http://www.lindadankmanlaw.com to take a screenshot
calculated 25680 total pixels in the image
fetching URL http://www.peachweathers.com to take a screenshot
calculated 6486912 total pixels in the image
fetching URL http://www.ledgerlaw.com to take a screenshot
calculated 15135552 total pixels in the image
fetching URL http://www.braytonlaw.com to take a screenshot
calculated 13944420 total pixels in the image
fetching URL https://www.caemployeelawyer.com/ to take a screenshot
calculated 32832 total pixels in the image
fetching URL https://www.salamatilaw.com to take a screenshot
calculated 17155584 total pixels in the image
fetching URL http://www.dfis-law.com to take a screenshot
calculated 7105536 total pixels in the image
fetching URL http://www.rosensaba.com to take a screenshot
calculated 30123756 total pixels in the image
fetching URL http://www.nieldlaw.com to take a screenshot
calculated 357808 total pixels in the image
fetching URL http://www.meissnerlaw.com to take a screenshot
calculated 1676160 total pixels in the image
fetching URL http://ellisinjurylaw.com to take a screenshot
calculated 15619392 total pixels in the image
fetching URL http://www.efglawyer.com to take a screenshot
calculated 9906624 total pixels in the image
fetching URL http://www.czechandhowell.com to take a screenshot
calculated 2890944 total pixels in the image
fetching URL https://www.larsonlegalservices.com/ to take a screenshot
calculated 9789120 total pixels in the image
fetching URL http://www.rwolaw.com to take a screenshot
calculated 4981824 total pixels in the image
fetching URL https://www.metierlaw.com to take a screenshot
calculated 12569472 total pixels in the image
fetching URL http://www.mcelyealawoffice.com to take a screenshot
calculated 1384128 total pixels in the image
fetching URL https://www.hsdlawfirm.com to take a screenshot
calculated 16391808 total pixels in the image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
In [161]:
# Small-scale smoke test: screenshot a handful of firm sites, build a
# per-site color-frequency map for each, then graph the aggregate results.
domains = ['https://www.knoxbotelerlaw.com', 'https://www.knrlegal.com', 'https://www.kollislaw.com', 'https://www.konellruggiero.com', 'https://www.konicekdillonlaw.com', 'https://www.kplegal.com/', 'https://www.krhlaw.com/', 'https://www.krwlawyers.com', 'https://www.kvpclaw.com', 'https://www.kylawpractice.com', 'https://www.kylerobbinslaw.com']

# Every screenshot is written to the same file, so compute the path once
# instead of on every iteration.
screenshot_path = os.path.join(os.getcwd(), 'screenshot.png')

color_frequency_maps = []
for domain in domains:
    # Match the error handling used in the full-scale run below: one site
    # that fails to screenshot (a common chromedriver failure mode in the
    # logs) should not abort the whole batch.
    try:
        take_screenshot(domain)
        color_frequency_maps.append(get_pixel_frequency_for_colors(screenshot_path))
    except Exception as e:
        print(f'got exception calculating pixel frequency for firm {domain} ({e})')

graph_most_prevalent_color(color_frequency_maps)
graph_most_common_color_combinations(color_frequency_maps)
calculated 25680 total pixels in the image
calculated 24223104 total pixels in the image
calculated 1389312 total pixels in the image
calculated 6931008 total pixels in the image
calculated 1384128 total pixels in the image
calculated 25680 total pixels in the image
calculated 350960 total pixels in the image
calculated 9052992 total pixels in the image
calculated 25680 total pixels in the image
calculated 14746752 total pixels in the image
calculated 6497280 total pixels in the image
No description has been provided for this image
No description has been provided for this image

Small scale testing is looking good! Let's run this for real :)

In [230]:
with Session(engine) as session:
    # Full-scale run: pull up to 10,000 firms from the DB.
    firms = session.scalars(select(Website).limit(10000)).all()

    # BUG FIX: the original passed the full `firms` list to the grouping
    # graphs alongside `color_frequency_maps`, but the maps list skips
    # firms flagged as failures and firms that raise — so the two lists
    # were different lengths and misaligned whenever any firm was skipped
    # (the logs above show many screenshot failures). Track the firms that
    # actually produced a frequency map in a parallel list instead.
    # NOTE(review): assumes the graph_popular_colors_by_* helpers pair the
    # two sequences positionally — confirm against their definitions.
    processed_firms = []
    color_frequency_maps = []
    for f in firms:
        # Skip firms already known to fail, so reruns are cheap.
        if f.has_frequency_calculation_failure:
            continue

        try:
            frequency_map = get_pixel_frequency_for_website(f)
        except Exception as e:
            print(f'got exception calculating pixel frequency for firm {f.domain} ({e})')
            continue

        processed_firms.append(f)
        color_frequency_maps.append(frequency_map)

    graph_most_prevalent_color(color_frequency_maps)
    graph_most_common_color_combinations(color_frequency_maps)
    graph_popular_colors_by_practice_area(processed_firms, color_frequency_maps)
    graph_popular_colors_by_primary_state(processed_firms, color_frequency_maps)
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image