source: rattail/rattail/excel.py @ 2e37e80

Last change on this file since 2e37e80 was 2e37e80, checked in by Lance Edgar <lance@…>, 7 months ago

Add basic date parsing for ExcelReaderXLSX

at least *try* to parse, if given a string value

  • Property mode set to 100644
File size: 11.8 KB
Line 
1# -*- coding: utf-8; -*-
2################################################################################
3#
4#  Rattail -- Retail Software Framework
5#  Copyright © 2010-2022 Lance Edgar
6#
7#  This file is part of Rattail.
8#
9#  Rattail is free software: you can redistribute it and/or modify it under the
10#  terms of the GNU General Public License as published by the Free Software
11#  Foundation, either version 3 of the License, or (at your option) any later
12#  version.
13#
14#  Rattail is distributed in the hope that it will be useful, but WITHOUT ANY
15#  WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
16#  FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
17#  details.
18#
19#  You should have received a copy of the GNU General Public License along with
20#  Rattail.  If not, see <http://www.gnu.org/licenses/>.
21#
22################################################################################
23"""
24Excel utilities
25"""
26
27from __future__ import unicode_literals, absolute_import
28
29import datetime
30
31import six
32import xlrd
33from xlrd.xldate import xldate_as_tuple
34
35import openpyxl
36from openpyxl.styles import Font, PatternFill
37from openpyxl.cell.cell import get_column_letter
38
39from rattail.util import progress_loop
40
41
42class ExcelReaderXLS(object):
43    """
44    Basic class for reading Excel "legacy" (.xls) files.
45
46    Uses the ``xlrd`` package to read the files.
47    """
48
49    def __init__(self, path, sheet=0, sheet_name=None, header=0,
50                 first_data_row=None,
51                 datefmt='%Y-%m-%d', strip_fieldnames=True):
52        """
53        Constructor; opens an Excel file for reading.
54
55        :param header: Which row should be used as the header, i.e. to
56           determine field (column) names.  This is a zero-based index, so is 0
57           by default (i.e. the first row).
58
59        :param first_data_row: Which is the first row to contain data.  If not
60           specified, it will be assumed that data rows begin immediately after
61           the header row, as defined by :param:`header`.  This again is
62           zero-based, so if the very first row is the true header, but then
63           there is another "header" row also, you might specify a value of
64           ``2`` here, since the 3rd row is the first to contain data.
65        """
66        self.book = xlrd.open_workbook(path)
67        if sheet_name is not None:
68            self.sheet = self.book.sheet_by_name(sheet_name)
69        else:
70            self.sheet = self.book.sheet_by_index(sheet)
71        self.header = header
72        if first_data_row is not None:
73            self.first_data_row = first_data_row
74        else:
75            self.first_data_row = self.header + 1
76        self.fields = self.sheet.row_values(self.header)
77        if strip_fieldnames:
78            self.fields = [field.strip() for field in self.fields]
79        self.datefmt = datefmt
80
81    def sheet_by_name(self, name):
82        return self.book.sheet_by_name(name)
83
84    def read_rows(self, progress=None):
85        rows = []
86
87        def append(row, i):
88            values = self.sheet.row_values(row)
89            data = dict([(self.fields[j], value)
90                         for j, value in enumerate(values)])
91            rows.append(data)
92
93        progress_loop(append, range(self.first_data_row, self.sheet.nrows), progress,
94                      message="Reading data from Excel file")
95        return rows
96
97    def parse_date(self, value, fmt=None):
98        if isinstance(value, float):
99            args = xldate_as_tuple(value, self.book.datemode)
100            return datetime.datetime(*args).date()
101        if value:
102            return datetime.datetime.strptime(value, fmt or self.datefmt).date()
103
104# TODO: this should become a base class and/or wrapper of some sort. for now
105# the "default" reader assumes XLS since apparently that is all we have ever
106# supported until now...which was a surprise to me...
107ExcelReader = ExcelReaderXLS
108
109
110class ExcelReaderXLSX(object):
111    """
112    Basic class for reading Excel 2010 (.xslx) files.
113
114    Uses the `openpyxl`_ package to read the files.
115
116    .. _openpyxl: https://openpyxl.readthedocs.io/en/stable/
117
118    :param path: Path to the Excel data file.
119
120    :param header_row: 1-based row number which contains the header,
121       with field names.
122
123    :param strip_fieldnames: If true (the default), any whitespace
124       surrounding the field names will be stripped, i.e. after they
125       are read from the header row.  Pass ``False`` here to suppress
126       the behavior and leave whitespace intact.
127    """
128
129    def __init__(self, path, header_row=1, strip_fieldnames=True,
130                 datefmt='%Y-%m-%d', **kwargs):
131        """
132        Constructor; opens an Excel file for reading.
133
134        :param path: Path to the Excel file.
135
136        :param header_row: Which row contains the column headers.  This is
137           1-based, so the 1 is the default (i.e. the first row).
138        """
139        # nb. after much use with no problems, eventually did come
140        # across a spreadsheet which contained formula instead of
141        # values for certain cells.  so now using ``data_only=True``
142        # to avoid the formula, hopefully nothing else breaks..
143        self.book = openpyxl.load_workbook(filename=path, data_only=True)
144        self.sheet = self.book.active
145
146        self.header_row = header_row
147        self.datefmt = datefmt
148
149        self.fields = None
150        # TODO: this seems like a hacky way to get the header fields?  we are
151        # "iterating" over the single header row, effectively...
152        for row in self.sheet.iter_rows(min_row=self.header_row,
153                                        max_row=self.header_row,
154                                        values_only=True):
155            assert self.fields is None
156            self.fields = [field for field in row
157                           if field is not None]
158
159        if strip_fieldnames:
160            self.fields = [field.strip() for field in self.fields]
161
162    def iter_rows(self):
163        return self.sheet.iter_rows(min_row=self.header_row + 1,
164                                    values_only=True)
165
166    def read_rows(self, progress=None):
167        fieldcount = len(self.fields)
168        rows = []
169
170        def append(row, i):
171            data = {}
172            for j, value in enumerate(row):
173                if j < fieldcount:
174                    data[self.fields[j]] = value
175            rows.append(data)
176
177        xlrows = list(self.iter_rows())
178        progress_loop(append, xlrows, progress,
179                      message="Reading data from Excel file")
180        return rows
181
182    # TODO: this is here just for method signature compatibility with
183    # the older xlrd-based reader above.  maybe should just deprecate
184    # and/or remove it though
185    def parse_date(self, value, fmt=None):
186        if isinstance(value, six.string_types):
187            if not value:
188                return
189            return datetime.datetime.strptime(value, fmt or self.datefmt).date()
190        return value
191
192
193class ExcelWriter(object):
194    """
195    Base class for Excel writers.
196    """
197
198    def __init__(self, path, fields, sheet_title=None, number_formats={}):
199        """
200        Constructor; opens an Excel workbook for writing.
201        """
202        self.path = path
203        self.fields = fields
204        self.book = openpyxl.Workbook()
205        self.sheet = self.book.active
206        if sheet_title:
207            self.sheet.title = sheet_title
208        self.number_formats = number_formats
209
210    def create_sheet(self, title):
211        """
212        Create a new sheet in the workbook, and make it active.
213        """
214        self.sheet = self.book.create_sheet(title)
215        return self.sheet
216
217    def enable_grid_lines(self, sheet=None, enabled=True):
218        if not sheet:
219            sheet = self.sheet
220        sheet.sheet_view.showGridLines = enabled
221
222    def disable_grid_lines(self, sheet=None):
223        self.enable_grid_lines(sheet=sheet, enabled=False)
224
225    def write_header(self, labels=None):
226        font = Font(bold=True)
227        for i, field in enumerate(self.fields, 1):
228            value = field
229            if labels and field in labels:
230                value = labels[field]
231            cell = self.sheet.cell(row=1, column=i, value=value)
232            cell.font = font
233
234    def write_row(self, data, row=None):
235        """
236        Write (append) a single data row to the current sheet.
237
238        :param row: The 1-based row number to which data should be written.
239        """
240        if row is None:
241            raise NotImplementedError("should be able to detect 'next' row here?")
242
243        self.sheet.append(data)
244
245        # apply number formats
246        if self.number_formats:
247            for col, field in enumerate(self.fields, 1):
248                if field in self.number_formats:
249                    cell = self.sheet.cell(row=row, column=col)
250                    cell.number_format = self.number_formats[field]
251
252        # apply row highlighting
253        if row % 2 == 0:
254            fill_even = PatternFill(patternType='solid',
255                                    fgColor='d9d9d9',
256                                    bgColor='d9d9d9')
257            for col, field in enumerate(self.fields, 1):
258                cell = self.sheet.cell(row=row, column=col)
259                cell.fill = fill_even
260
261    def write_rows(self, rows, progress=None):
262        """
263        Write (append) a sequence of data rows to the current sheet.
264        """
265        def write(data, i):
266            # must add 1 to account for header
267            self.write_row(data, row=i + 1)
268
269        progress_loop(write, rows, progress,
270                      message="Generating cells for spreadsheet")
271
272    def auto_freeze(self, row=2, column=1):
273        """
274        Freeze sheet per "the usual"
275        """
276        self.sheet.freeze_panes = self.sheet.cell(row=row, column=column)
277
278    def auto_filter(self):
279        """
280        Add auto filters for all columns.
281        """
282        first = self.sheet.cell(row=1, column=1)
283        last = self.sheet.cell(row=self.sheet.max_row, column=self.sheet.max_column)
284        cellrange = '{}:{}'.format(first.coordinate, last.coordinate)
285        self.sheet.auto_filter.ref = cellrange
286
287    def auto_resize(self, progress=None):
288        """
289        (Try to) Auto-resize all data columns.
290        """
291        # note, some of the below uses efficiency tricks from these docs
292        # https://openpyxl.readthedocs.io/en/stable/tutorial.html#accessing-many-cells
293
294        # we must calculate desired column widths.  but for sake of progress,
295        # we'll iterate through rows instead of columns, to do that.  (this is
296        # just to give a higher total for the progress bar.)  so the first pass
297        # is really just to cache all existing string widths.
298        cached = []
299
300        def cache(row, i):
301            cached.append([len(six.text_type(value))
302                           for value in row])
303
304        progress_loop(cache, list(self.sheet.values), progress,
305                      message="Calculating all string widths")
306
307        # okay, now can determine ideal widths
308        column_widths = []
309
310        def calculate(col, i):
311            width = max([row_widths[col] for row_widths in cached])
312            column_widths.append(width or 5)
313
314        progress_loop(calculate, range(self.sheet.max_column), progress,
315                      message="Calculating desired column widths")
316
317        # resize columns
318        for i, width in enumerate(column_widths, 1):
319            self.sheet.column_dimensions[get_column_letter(i)].width = width + 3
320
321    def save(self, progress=None):
322        """
323        Save the Excel workbook to file.  If ``progress`` is provided, it will
324        be used in a hacky sort of way, i.e. from 0 to 1 only since we have no
325        way of knowing true progress for the save operation.  (But it can still
326        be nice to let user know this is the step we're on at least.)
327        """
328        def save(x, i):
329            self.book.save(self.path)
330
331        progress_loop(save, range(1), progress,
332                      message="Saving workbook to file")
Note: See TracBrowser for help on using the repository browser.