RosettaCodeData/Task/Text-processing-2/Python/text-processing-2-2.py

46 lines
1.5 KiB
Python

import re
import zipfile
import StringIO
def munge2(readings, debug=False):
datePat = re.compile(r'\d{4}-\d{2}-\d{2}')
valuPat = re.compile(r'[-+]?\d+\.\d+')
statPat = re.compile(r'-?\d+')
totalLines = 0
dupdate, badform, badlen, badreading = set(), set(), set(), 0
datestamps = set([])
for line in readings:
totalLines += 1
fields = line.split('\t')
date = fields[0]
pairs = [(fields[i],fields[i+1]) for i in range(1,len(fields),2)]
lineFormatOk = datePat.match(date) and \
all( valuPat.match(p[0]) for p in pairs ) and \
all( statPat.match(p[1]) for p in pairs )
if not lineFormatOk:
if debug: print 'Bad formatting', line
badform.add(date)
if len(pairs)!=24 or any( int(p[1]) < 1 for p in pairs ):
if debug: print 'Missing values', line
if len(pairs)!=24: badlen.add(date)
if any( int(p[1]) < 1 for p in pairs ): badreading += 1
if date in datestamps:
if debug: print 'Duplicate datestamp', line
dupdate.add(date)
datestamps.add(date)
print 'Duplicate dates:\n ', '\n '.join(sorted(dupdate))
print 'Bad format:\n ', '\n '.join(sorted(badform))
print 'Bad number of fields:\n ', '\n '.join(sorted(badlen))
print 'Records with good readings: %i = %5.2f%%\n' % (
totalLines-badreading, (totalLines-badreading)/float(totalLines)*100 )
print 'Total records: ', totalLines
readings = open('readings.txt','r')
munge2(readings)