Difference between revisions of "Tutorial: Python & DOCX II"
(→The MS Word template) |
|||
(2 intermediate revisions by the same user not shown) | |||
Line 1: | Line 1: | ||
[[User:Thiebaut|D. Thiebaut]] ([[User talk:Thiebaut|talk]]) 04:13, 28 June 2018 (EDT) | [[User:Thiebaut|D. Thiebaut]] ([[User talk:Thiebaut|talk]]) 04:13, 28 June 2018 (EDT) | ||
---- | ---- | ||
+ | <br /> | ||
+ | [[Image:DocxDiplomaOutputMickeyMouse.png|500px|center]] | ||
+ | <br /> | ||
=The Basic Idea= | =The Basic Idea= | ||
Line 8: | Line 11: | ||
* This is a typical Mail-Merge type problem. | * This is a typical Mail-Merge type problem. | ||
* The trick used here is that the MS Word doc diplomas file, containing all the diplomas, starts with 100 (or whatever number is good for you) blank diploma pages, more than the actual number of rows in the csv file, and substitutes the student information in each page. | * The trick used here is that the MS Word doc diplomas file, containing all the diplomas, starts with 100 (or whatever number is good for you) blank diploma pages, more than the actual number of rows in the csv file, and substitutes the student information in each page. | ||
+ | <br /> | ||
+ | =Requirements= | ||
+ | <br /> | ||
+ | * Install the Python docx library using pip3 | ||
+ | * This code uses the csv library that is already part of Python 3 | ||
<br /> | <br /> | ||
=The CSV File= | =The CSV File= | ||
Line 33: | Line 41: | ||
=The Python Script= | =The Python Script= | ||
<br /> | <br /> | ||
+ | ::<source lang="python"> | ||
+ | #! /usr/bin/env python3 | ||
+ | # generateDiplomas.py | ||
+ | # D. Thiebaut | ||
+ | # Inspired by code provided at https://stackoverflow.com/questions/24805671/ | ||
+ | # how-to-use-python-docx-to-replace-text-in-a-word-document-and-save | ||
+ | from __future__ import print_function | ||
+ | import re | ||
+ | from docx import Document | ||
+ | import csv | ||
+ | import os | ||
+ | |||
#=======================================================
#                      GLOBALS
#=======================================================
# Index (0-based) of the csv column holding the course name.
# main() reads the course of every row from this column.
COURSE_NAME_FIELD = 10

# Default location of the csv file.  main() passes it to
# getValidFileName(), which prompts for another path if this
# one cannot be opened.
CSV_FILE_NAME = "students.csv"

# The file where the diplomas will be saved.  With the active
# assignment below they go to the current working directory;
# enable the commented-out line instead to save them on the Desktop.
#DIPLOMAS_FILE = os.getenv("HOME") +"/Desktop/Diplomas_all.docx"
DIPLOMAS_FILE = "Diplomas_all.docx"

# TEMPLATE100 contains 100 blank diplomas with 3 placeholder strings
# marking where the first name, last name, and course name should go
# on each diploma (generateDiploma() looks for "33", "44" and "555").
TEMPLATE100 = '100_diploma_TEMPLATE.docx'

# The list of valid course names.  These should appear in column
# COURSE_NAME_FIELD of the csv file.  generateAllDiplomas() skips,
# with a warning, every course that is not listed here.
courses=[ "Animation I", "Animation II" ]

#                     1         2         3
#           0123456789012345678901234567890
# Shorter version of the course names, with _ instead of spaces:
# the first 24 characters of each name, split on whitespace, with
# the last word dropped by the [0:-1] slice.
# NOTE(review): both sample courses therefore shorten to "Animation",
# and nothing else in this script reads coursesShort -- confirm
# whether it is still needed.
coursesShort = [ "_".join( k[0:24].split()[0:-1] ) for k in courses ]

# Course dictionary:
# key is the course name, value is the list of participants.  Each
# participant is the list of fields of one row of the csv file.
# Filled in by main().
CourseParticipants = { }
+ | |||
+ | #======================================================= | ||
+ | # FUNCTIONS | ||
+ | #======================================================= | ||
def getValidFileName( prompt, defaultFileName ):
    '''Return the name of a file that can be opened for reading.

    defaultFileName is tried first.  As long as the current candidate
    cannot be opened, prompt is printed and a new name is read from
    the keyboard.  The function only returns once a name has been
    opened successfully.

    prompt:          text printed before each new name is requested.
    defaultFileName: first file name to try.
    returns:         the first name that could be opened in read mode.
    '''
    while True:
        try:
            # opening the file is only a readability probe; the
            # with-statement closes it again immediately.
            with open( defaultFileName, 'r' ):
                return defaultFileName
        except (OSError, ValueError):
            # OSError: missing or unreadable file.  ValueError: a name
            # that open() rejects outright (e.g. an embedded NUL).
            # Anything else (Ctrl-C, programming errors) now propagates
            # instead of being swallowed by a bare except.
            print( prompt )
            defaultFileName = input()
+ | |||
+ | |||
def docx_replace_regex(doc_obj, regex , replace):
    '''Replace the first occurrence of regex in doc_obj by replace.

    The paragraphs of doc_obj are scanned in order.  In the first
    paragraph whose text matches regex, the first run (string with a
    single style) that matches gets its first match substituted, and
    the function returns.  doc_obj is modified in place, so there is
    nothing to return.

    doc_obj: object exposing .paragraphs; each paragraph exposes
             .text and .runs, and each run exposes a writable .text.
    regex:   compiled regular expression to look for.
    replace: replacement text, inserted literally.  A callable is
             passed to regex.sub() unchanged.

    A match that only exists across the boundary of two runs is not
    replaced, because runs are searched one at a time.
    '''
    # Names coming from the csv file are data, not regex templates:
    # wrapping them in a function stops re from interpreting
    # backslashes (bad escapes raise re.error, \1 is a group reference).
    if callable( replace ):
        substitute = replace
    else:
        substitute = lambda _match: replace

    for p in doc_obj.paragraphs:
        if not regex.search(p.text):
            continue
        # Loop over the runs so that the style of the text is kept
        for run in p.runs:
            if regex.search(run.text):
                run.text = regex.sub(substitute, run.text, count=1 )
                # only the next placeholder must be filled in
                return
+ | |||
+ | #for table in doc_obj.tables: | ||
+ | # for row in table.rows: | ||
+ | # for cell in row.cells: | ||
+ | # docx_replace_regex(cell, regex , replace) | ||
+ | |||
+ | |||
def generateDiploma(doc, firstName, lastName, course):
    """
    Fill in the next blank diploma of the Word document doc.

    The next "33" placeholder receives the first name, the next "44"
    the last name, and the next "555" the course name of the student.
    The substitutions are applied in that order, one call to
    docx_replace_regex() per placeholder.
    """
    substitutions = (
        ( r"33",  firstName ),
        ( r"44",  lastName ),
        ( r"555", course ),
    )
    for placeholder, value in substitutions:
        docx_replace_regex( doc, re.compile( placeholder ), value )
+ | |||
def generateAllDiplomas( CourseParticipants ):
    '''
    Given the dictionary of all participants, substitute
    all the names and courses in the MS Word doc.

    CourseParticipants: dictionary whose keys are course names and
        whose values are lists of participants.  Each participant is
        the list of fields of one csv row, with the last name in
        field 0 and the first name in field 1.

    The template named by TEMPLATE100 is opened (the user is prompted
    for another path if it cannot be read), one diploma is filled in
    per participant of every valid course, and the result is saved
    in DIPLOMAS_FILE.  Courses that are not in the courses list are
    reported and skipped.
    '''
    fileName = getValidFileName( "What is the path of the MS Word template? ",
                                 TEMPLATE100 )
    doc = Document( fileName )

    # for each course, go through its list of participants
    for course, participants in CourseParticipants.items():

        # if course not valid, skip it
        if course not in courses:
            print( "*** WARNING ***: invalid course:", course )
            continue

        for fields in participants:
            # Only the two name fields are needed, so read them by
            # position: unpacking the whole row into 14 names raised
            # ValueError for rows with any other number of fields.
            if len( fields ) < 2:
                print( "*** WARNING ***: skipping incomplete row:", fields )
                continue

            # strip the blank that follows the comma in "Mouse, Mickey"
            lastName  = fields[0].strip()
            firstName = fields[1].strip()

            print( "generating diploma for", firstName, lastName )
            generateDiploma( doc, firstName, lastName, course )

    doc.save( DIPLOMAS_FILE )
    print( "Diplomas saved in", DIPLOMAS_FILE )
+ | |||
def main():
    '''
    Read the csv file, group its rows by course in the
    CourseParticipants dictionary, and generate all the diplomas.

    The user is prompted for the path of the csv file if
    CSV_FILE_NAME cannot be opened.
    '''
    global CSV_FILE_NAME
    CSV_FILE_NAME = getValidFileName( "What is the full path of the csv file? ",
                                      CSV_FILE_NAME )

    # read the csv file and parse it
    with open( CSV_FILE_NAME, newline='' ) as csvfile:

        # create a csv reader
        csvReader = csv.reader( csvfile, delimiter=',', quotechar='"')
        for i, fields in enumerate( csvReader ):
            # skip first line that contains headers
            if i==0: continue

            # skip ill formed lines: a row must reach the course
            # column, otherwise the lookup below raises IndexError
            if len( fields ) <= COURSE_NAME_FIELD: continue

            # get course from appropriate field
            course = fields[COURSE_NAME_FIELD]

            # add new participant to list associated with his/her course
            CourseParticipants.setdefault( course, [] ).append( fields )

    generateAllDiplomas( CourseParticipants )

# run the script only when executed directly, not when imported
if __name__ == "__main__":
    main()
+ | |||
+ | |||
+ | </source> | ||
<br /> | <br /> | ||
<br /> | <br /> |
Latest revision as of 03:28, 28 June 2018
D. Thiebaut (talk) 04:13, 28 June 2018 (EDT)
Contents
The Basic Idea
- You have a csv file with a list of students taking different workshops. One field of the csv file contains the first name of the student, another field the last name, and a third field, the workshop title taken by the student.
- The python program shown here generates a Word document where each page is a diploma page, with a logo for the school, the first and last names of the student, and the workshop taken by the student.
- This is a typical Mail-Merge type problem.
- The trick used here is that the MS Word template file starts with 100 (or whatever number is good for you) blank diploma pages, more than the actual number of rows in the csv file. The script substitutes each student's information into the next blank page and saves the result as the diplomas file.
Requirements
- Install the Python docx library using pip3 (the package is named python-docx: pip3 install python-docx; it is imported in the script as docx)
- This code uses the csv library that is already part of Python 3
The CSV File
Here is an example csv file. Update to match your needs. Note that the csv file contains 4 lines, but that some of them might wrap when displayed.
- students.csv
Last name,First name,Institution,Address 1,Address 2,City,zip,Country,Phone,Email,Course,misc1,misc2,misc3
Mouse, Mickey,U. of Orlando,Disney World,,"Orlando, FL",12345,USA,+1(234)567 89 00,mickeymouse@disney.com,Animation I,null,null,null
Mouse, Minie,U. of Orlando,Disney World,,"Orlando, FL",12345,USA,+1(234)567 89 00,miniemouse@disney.com,Animation II,null,null,null
Duck, Duffy,U. of Orlando,Disney World,,"Orlando, FL",12345,USA,+1(234)567 89 99,duffyduck@disney.com,Animation I,null,null,null
The MS Word template
- Here is the MS Word document containing 100 (or whatever number is larger than the number of students) blank diplomas.
- 100_diploma_TEMPLATE.docx (this file is zipped. Unzip before using)
The output MS Word file
- Here is the MS Word generated by the Python script (below).
- Diplomas_all.docx (this file is zipped. Unzip before using)
The Python Script
#! /usr/bin/env python3 # generateDiplomas.py # D. Thiebaut # Inspired by code provided at https://stackoverflow.com/questions/24805671/ # how-to-use-python-docx-to-replace-text-in-a-word-document-and-save from __future__ import print_function import re from docx import Document import csv import os #======================================================= # GLOBALS #======================================================= # the field in the csv file where the course name is saved. # the first field is 0 COURSE_NAME_FIELD = 10 # the location of the csv file. The program will prompt for # it if it doesn't find it CSV_FILE_NAME = "students.csv" # the file where the diplomas will be saved # by default, they will go to the Desktop #DIPLOMAS_FILE = os.getenv("HOME") +"/Desktop/Diplomas_all.docx" DIPLOMAS_FILE = "Diplomas_all.docx" # TEMPLATE100 contains 100 blank diplomas with 3 strings defining # the first name, last name, and course name that should go on each # diploma. TEMPLATE100 = '100_diploma_TEMPLATE.docx' # the list of valid course names. These should appear in Field 10 # of the csv file. If a student has a course name that doesn't appear # in the list below, that student will be skipped. courses=[ "Animation I", "Animation II" ] # 1 2 3 # 0123456789012345678901234567890 # define shorter version of course names, with _ instead of spaces coursesShort = [ "_".join( k[0:24].split()[0:-1] ) for k in courses ] # define course dictionary: # key is course, value is list of participants. 
Each participant is # a list of fields, as found in the csv file CourseParticipants = { } #======================================================= # FUNCTIONS #======================================================= def getValidFileName( prompt, defaultFileName ): while True: try: f=open( defaultFileName, 'r' ) f.close() return defaultFileName except: print( prompt ) defaultFileName = input() def docx_replace_regex(doc_obj, regex , replace): '''searches a document (doc_obj) and locates regex, and replaces it with replace. doc_obj passed by reference so no need to return it. ''' for p in doc_obj.paragraphs: if regex.search(p.text): inline = p.runs # Loop added to work with runs (strings with same style) for i in range(len(inline)): if regex.search(inline[i].text): text = regex.sub(replace, inline[i].text, count=1 ) inline[i].text = text return #for table in doc_obj.tables: # for row in table.rows: # for cell in row.cells: # docx_replace_regex(cell, regex , replace) def generateDiploma(doc, firstName, lastName, course): """ replace the next "33", "44", and "555" words in the word document by the first name, last name and course name of the next student. """ docx_replace_regex(doc, re.compile( r"33" ), firstName ) docx_replace_regex(doc, re.compile( r"44" ), lastName ) docx_replace_regex(doc, re.compile( r"555" ), course ) def generateAllDiplomas( CourseParticipants ): ''' Given the dictionary of all participants, substitute all the names and courses in the MS Word doc. ''' global courses, DIPLOMAS_FILE, TEMPLATE100 fileName = TEMPLATE100 fileName = getValidFileName( "What is the path of the MS Word template? 
", fileName ) doc = Document( fileName ) # for each course, generate list of participants for j, course in enumerate( CourseParticipants ): # if course not valid, skip it if course not in courses: print( "*** WARNING ***: invalid course:", course ) continue for i,fields in enumerate( CourseParticipants[ course ] ): lastName,firstName, institution, address1, address2, \ city, zip, country, phone, email, course, _,_,_ = fields print( "generating diploma for", firstName, lastName ) generateDiploma( doc, firstName, lastName, course ) doc.save( DIPLOMAS_FILE ) print( "Diplomas saved in", DIPLOMAS_FILE ) def main(): global CSV_FILE_NAME CSV_FILE_NAME = getValidFileName( "What is the full path of the csv file? ", CSV_FILE_NAME ) # read the csv file and parse it with open( CSV_FILE_NAME, newline='' ) as csvfile: # create a csv reader csvReader = csv.reader( csvfile, delimiter=',', quotechar='"') for i, fields in enumerate( csvReader ): # skip ill formed lines if len( fields ) <= 5: continue # skip first line that contains headers if i==0: continue # get course from appropriate field course = fields[COURSE_NAME_FIELD] # add new participant to list associated with his/her course try: CourseParticipants[ course ].append( fields ) except: CourseParticipants[ course ] = [ fields ] generateAllDiplomas( CourseParticipants ) main()