# code 2
import docx
import pandas as pd

# Phrases whose presence marks a table paragraph as a question/requirement.
# Matching is a case-sensitive substring test, which is why both
# 'Please provide' and 'please provide' are listed.
_QUESTION_PHRASES = (
    'Candidates must', 'Candidates should', 'must provide', 'must enclose',
    'must submit', 'required to', 'to include', 'Please outline',
    'Please provide', 'should include', 'Give an example', 'needs to',
    'should be included', 'please provide', 'Please explain',
    'Please complete', 'Please clarify', 'Please identify',
    'Please describe', 'Please tell', 'Please give', 'Does your',
    'If you are', 'is offered', 'Has your company', 'Is there anything',
    'shall provide', 'shall ensure', 'shall describe', 'shall explain',
    'The Supplier', 'Please confirm', 'asked to', 'should furnish',
    'Please include', 'Specify any', 'Please detail', 'should detail',
    'should provide',
)


def _table_paragraph_texts(doc):
    """Yield the text of every paragraph in every table cell, in document order."""
    for table in doc.tables:
        for row in table.rows:
            for cell in row.cells:
                for paragraph in cell.paragraphs:
                    yield paragraph.text


def _select_questions(paragraphs, phrases=_QUESTION_PHRASES):
    """Return the distinct paragraphs containing any phrase, in first-seen order."""
    matches = (
        text for text in paragraphs
        if any(phrase in text for phrase in phrases)
    )
    # dict.fromkeys de-duplicates while keeping insertion order, so the
    # spreadsheet rows follow the document and are identical on every run
    # (a plain set would emit them in arbitrary order).
    return list(dict.fromkeys(matches))


def getText(filename, response_file):
    """Extract question-like paragraphs from a .docx's tables into an Excel file.

    Only text inside tables is scanned; paragraphs outside tables are not
    examined. A paragraph is kept when it contains at least one of the
    trigger phrases in ``_QUESTION_PHRASES``. Duplicate paragraphs are
    written once, in the order they first appear in the document.

    Args:
        filename: Document to read; passed straight to ``docx.Document``.
        response_file: Destination passed straight to ``DataFrame.to_excel``.
            The sheet has a single ``Questions`` column plus the default
            DataFrame index column.

    Returns:
        None. The result is the spreadsheet written to ``response_file``.
    """
    doc = docx.Document(filename)
    questions = _select_questions(_table_paragraph_texts(doc))
    df = pd.DataFrame({'Questions': questions})
    df.to_excel(response_file, header=True)


if __name__ == '__main__':
    # Guarded so that importing this module does not block on stdin.
    # Example input: NHSBT1306 - RM3733 Further competition Document v1.docx
    input_file = input('Share the input file to process: ')
    # Example output: C:/Users/ranjanpriy/Desktop/Questions.xlsx
    response_file = input('Enter the response file path: ')
    getText(input_file, response_file)