Category: Python

Python: Listing XML tags

I was having a lot of trouble using find/findall when parsing an XML document — turns out the namespace prefixed the tag name … so I needed to find {http://maven.apache.org/POM/4.0.0}groupId instead of just groupId

How do you figure that out? Quickest way, for me, was just to print out all of the tag names.

from lxml import etree
# Load POM XML into tree
tree = etree.parse( strXMLFile )

# # List all element names in XML document
for element in tree.iter():
     print(element.tag)

Python: Generate Transcript of Video File

There’s a speech_recognition module in Python that transcribes an audio file — since ffmpeg can convert a video file to mp3, that means you can also use Python to transcribe a video file.

# requires pocketsphinx from CMU if using sphinx for speech to text recognition
import os
import speech_recognition as sr
import ffmpeg

strFFMPEGBinaryLocation = 'c:/tmp/ffmpeg/bin/ffmpeg.exe'
strCurrentDirectory = os.getcwd()

strInputVideo = "\"Z:/Path To/My Video/file.MP4\""
strOutputFileName = "converted.wav"
# Convert mp4 to wav file
strffmpeg_convert_mp4_to_wav = f'{strFFMPEGBinaryLocation} -i {strInputVideo} {strCurrentDirectory}/{strOutputFileName}'
os.system(strffmpeg_convert_mp4_to_wav)

# Run converted wav file through speech recognizer
r = sr.Recognizer()
audio = sr.AudioFile(f'{strCurrentDirectory}/{strOutputFileName}')

with audio as source:
	#audio = r.record(source, 90)				# Would need API key to process longer audio?
	#text = r.recognize_google(audio)
	audio = r.record(source)
	text = r.recognize_sphinx(audio)
print(text)

Python Code — Creating Title Images

Instead of allowing YouTube to randomly pick a frame to use as the preview image, I have always made a title image for the Township meetings I post to YouTube. At first, this was a manual (and thus time consuming for a lot of videos). In the interim, I have created a script that generates the color gradient background and overlays text including the meeting type and date.

# Valid meeting types: "TrusteeRegular",  "TrusteeSpecial", "TrusteeEmer", "TrusteeHearing", "BZAReg", "BZAHearing", "ZCReg", "ZCHearing"
strMeetingListSpreadsheet = 'MeetingList.xlsx'

from PIL import Image, ImageDraw, ImageFont
import pandas as pd

BLACK= (0,0,0)
WHITE = (255,255,255)

TRUSTEE_COLOR_PALETTE = [(156,12,12), (92,7,7), (0,0,0)]
BZA_COLOR_PALETTE = [(253,139,1), (91,51,0), (0,0,0)]
ZC_COLOR_PALETTE = [(24,113,56), (8,41,20), (0,0,0)]
MISC_COLOR_PALETTE = [(175,28,195), (55,9,61), (0,0,0)]

objFontMeetingTitle = ImageFont.truetype("/usr/share/fonts/liberation-sans/LiberationSans-Regular.ttf",115)
objFontMeetingTopic = ImageFont.truetype("/usr/share/fonts/liberation-sans/LiberationSans-Regular.ttf",115)
objFontMeetingDate = ImageFont.truetype("/usr/share/fonts/liberation-sans/LiberationSans-Italic.ttf",95)

class Point(object):
    def __init__(self, x, y):
        self.x, self.y = x, y

class Rect(object):
    def __init__(self, x1, y1, x2, y2):
        minx, maxx = (x1,x2) if x1 < x2 else (x2,x1)
        miny, maxy = (y1,y2) if y1 < y2 else (y2,y1)
        self.min = Point(minx, miny)
        self.max = Point(maxx, maxy)

    width  = property(lambda self: self.max.x - self.min.x)
    height = property(lambda self: self.max.y - self.min.y)

def gradient_color(minval, maxval, val, color_palette):
    """ Computes intermediate RGB color of a value in the range of minval
        to maxval (inclusive) based on a color_palette representing the range.
    """
    max_index = len(color_palette)-1
    delta = maxval - minval
    if delta == 0:
        delta = 1
    v = float(val-minval) / delta * max_index
    i1, i2 = int(v), min(int(v)+1, max_index)
    (r1, g1, b1), (r2, g2, b2) = color_palette[i1], color_palette[i2]
    f = v - i1
    return int(r1 + f*(r2-r1)), int(g1 + f*(g2-g1)), int(b1 + f*(b2-b1))

def horz_gradient(draw, rect, color_func, color_palette):
    minval, maxval = 1, len(color_palette)
    delta = maxval - minval
    width = float(rect.width)  # Cache.
    for x in range(rect.min.x, rect.max.x+1):
        f = (x - rect.min.x) / width
        val = minval + f * delta
        color = color_func(minval, maxval, val, color_palette)
        draw.line([(x, rect.min.y), (x, rect.max.y)], fill=color)

def vert_gradient(draw, rect, color_func, color_palette):
    minval, maxval = 1, len(color_palette)
    delta = maxval - minval
    height = float(rect.height)  # Cache.
    for y in range(rect.min.y, rect.max.y+1):
        f = (y - rect.min.y) / height
        val = minval + f * delta
        color = color_func(minval, maxval, val, color_palette)
        draw.line([(rect.min.x, y), (rect.max.x, y)], fill=color)


if __name__ == '__main__':
    df = pd.read_excel(strMeetingListSpreadsheet, sheet_name="Sheet1")

    df = df.reset_index()  # make sure indexes pair with number of rows

    for index, row in df.iterrows():
        strGraphicName = f"{row['Date'].strftime('%Y%d%m')}-{row['Type']}.png"
        strMeetingType = row['Type']

        # Draw a three color horizontal gradient.
        region = Rect(0, 0, 1920, 1080)
        width, height = region.max.x+1, region.max.y+1
        image = Image.new("RGB", (width, height), BLACK)
        draw = ImageDraw.Draw(image)

        # Add meeting title
        if strMeetingType == "TrusteeRegular":
            horz_gradient(draw, region, gradient_color, TRUSTEE_COLOR_PALETTE)
            draw.text((1670, 525),"Trustee Regular Meeting",WHITE,font=objFontMeetingTopic, anchor="rm")
        elif strMeetingType == "TrusteeSpecial":
            horz_gradient(draw, region, gradient_color, TRUSTEE_COLOR_PALETTE)
            draw.text((1670, 525),"Trustee Special Meeting",WHITE,font=objFontMeetingTopic, anchor="rm")
        elif strMeetingType == "TrusteeEmer":
            horz_gradient(draw, region, gradient_color, TRUSTEE_COLOR_PALETTE)
            draw.text((1670, 525),"Trustee Emergency Meeting",WHITE,font=objFontMeetingTopic, anchor="rm")
        elif strMeetingType == "TrusteeHearing":
            horz_gradient(draw, region, gradient_color, TRUSTEE_COLOR_PALETTE)
            draw.text((1670, 525),"Trustee Public Hearing",WHITE,font=objFontMeetingTopic, anchor="rm")
        elif strMeetingType == "BZAReg":
            horz_gradient(draw, region, gradient_color, BZA_COLOR_PALETTE)
            draw.text((1670, 525),"BZA Regular Meeting",WHITE,font=objFontMeetingTopic, anchor="rm")
        elif strMeetingType == "BZAHearing":
            horz_gradient(draw, region, gradient_color, BZA_COLOR_PALETTE)
            draw.text((1670, 525),"BZA Public Hearing",WHITE,font=objFontMeetingTopic, anchor="rm")
        elif strMeetingType == "ZCReg":
            horz_gradient(draw, region, gradient_color, ZC_COLOR_PALETTE)
            draw.text((1670, 525),"Zoning Commission Meeting",WHITE,font=objFontMeetingTopic, anchor="rm")
        elif strMeetingType == "ZCHearing":
            horz_gradient(draw, region, gradient_color, ZC_COLOR_PALETTE)
            draw.text((1670, 525),"Zoning Commission Hearing",WHITE,font=objFontMeetingTopic, anchor="rm")
        else:
            horz_gradient(draw, region, gradient_color, MISC_COLOR_PALETTE)
            draw.text((1670, 525),"Township Meeting",WHITE,font=objFontMeetingTopic, anchor="rm")

        # Add township and date
        draw.text((1070, 225),"Hinckley Township",WHITE,font=objFontMeetingTitle, anchor="rm")
        draw.text((1770, 825),row['Date'].strftime('%B %d, %Y'),WHITE,font=objFontMeetingDate, anchor="rm")

        image.save(strGraphicName, "PNG")
        print(f"image saved as {strGraphicName}")


I have an Excel file which contains the meeting type code, a long meeting title that is used as the second line of the image, a date (and a MeetingDate that I use in my concat formulae that create the title and description for YouTube). To use an Excel date in concat, you need to use a TEXT formula with the text formatting string.

This allows me to have a consistent preview image for all of our postings without actually making dozens of files by hand.

Bulk Download of YouTube Videos from Channel

Several years ago, I started recording our Township meetings and posting them to YouTube. This was very helpful — even our government officials used the recordings to refresh their memory about what happened in a meeting. But it also led people to ask “why, exactly, are we relying on some random citizen to provide this service? What if they are busy? Or move?!” … and the Township created their own channel and posted their meeting recordings. This was a great way to promote transparency however they’ve got retention policies. Since we have absolutely been at meetings where it would be very helpful to know what happened five, ten, forty!! years ago … my expectation is that these videos will be useful far beyond the allotted document retention period.

We decided to keep our channel around with the historic archive of government meeting recordings. There’s no longer time criticality — anyone who wants to see a current meeting can just use the township’s channel. We have a script that lists all of the videos from the township’s channel and downloads them — once I complete back-filling our archive, I will modify the script to stop once it reaches a video series we already have. But this quick script will list all videos published to a channel and download the highest quality MP4 file associated with that video.

# API key for my Google Developer project
strAPIKey = '<CHANGEIT>'

# Youtube account channel ID
strChannelID = '<CHANGEIT>'

import os
from time import sleep
import urllib
from urllib.request import urlopen
import json
from pytube import YouTube
import datetime

from config import dateLastDownloaded

os.chdir(os.path.dirname(os.path.abspath(__file__)))
print(os.getcwd())

strBaseVideoURL = 'https://www.youtube.com/watch?v='
strSearchAPIv3URL= 'https://www.googleapis.com/youtube/v3/search?'

iStart = 0		# Not used -- included to allow skipping first N files when batch fails midway
iProcessed = 0		# Just a counter

strStartURL = f"{strSearchAPIv3URL}key={strAPIKey}&channelId={strChannelID}&part=snippet,id&order=date&maxResults=50"
strYoutubeURL = strStartURL

while True:
    inp = urllib.request.urlopen(strYoutubeURL)
    resp = json.load(inp)

    for i in resp['items']:
        if i['id']['kind'] == "youtube#video":
            iDaysSinceLastDownload = datetime.datetime.strptime(i['snippet']['publishTime'], "%Y-%m-%dT%H:%M:%SZ") - dateLastDownloaded
            # If video was posted since last run time, download the video
            if iDaysSinceLastDownload.days >= 0:
                strFileName = (i['snippet']['title']).replace('/','-').replace(' ','_')
                print(f"{iProcessed}\tDownloading file {strFileName} from {strBaseVideoURL}{i['id']['videoId']}")
                # Need to retrieve a youtube object and filter for the *highest* resolution otherwise we get blurry videos
                if iProcessed >= iStart:
                    yt = YouTube(f"{strBaseVideoURL}{i['id']['videoId']}")
                    yt.streams.filter(progressive=True, file_extension='mp4').order_by('resolution').desc().first().download(filename=f"{strFileName}.mp4")
                    sleep(90)
                iProcessed = iProcessed + 1
    try:
        next_page_token = resp['nextPageToken']
        strYoutubeURL = strStartURL + '&pageToken={}'.format(next_page_token)
        print(f"Now getting next page from {strYoutubeURL}")
    except:
        break

# Update config.py with last run date
f = open("config.py","w")
f.write("import datetime\n")
f.write(f"dateLastDownloaded = datetime.datetime({datetime.datetime.now().year},{datetime.datetime.now().month},{datetime.datetime.now().day},0,0,0)")
f.close