csv-handler"
# CSV Handler for Construction Data
## Overview
CSV is the universal exchange format in construction - from scheduling exports to cost databases. This skill handles encoding issues, delimiter detection, and data cleaning.
## Python Implementation
```python
import pandas as pd
import csv
from typing import Dict, Any, List, Optional, Tuple
from pathlib import Path
from dataclasses import dataclass
import chardet
@dataclass
class CSVProfile:
"""Profile of CSV file."""
encoding: str
delimiter: str
has_header: bool
row_count: int
column_count: int
columns: List[str]
class ConstructionCSVHandler:
"""Handle CSV files from construction software."""
COMMON_DELIMITERS = [',', ';', '\t', '|']
COMMON_ENCODINGS = ['utf-8', 'utf-8-sig', 'latin-1', 'cp1252', 'iso-8859-1']
def __init__(self):
self.last_profile: Optional[CSVProfile] = None
def detect_encoding(self, file_path: str) -> str:
"""Detect file encoding."""
with open(file_path, 'rb') as f:
raw = f.read(10000)
result = chardet.detect(raw)
return result.get('encoding', 'utf-8') or 'utf-8'
def detect_delimiter(self, file_path: str, encoding: str) -> str:
"""Detect CSV delimiter."""
with open(file_path, 'r', encoding=encoding, errors='replace') as f:
sample = f.read(5000)
# Count occurrences
counts = {d: sample.count(d) for d in self.COMMON_DELIMITERS}
# Return most common that appears consistently
if counts:
return max(counts, key=counts.get)
return ','
def profile_csv(self, file_path: str) -> CSVProfile:
"""Profile CSV file."""
encoding = self.detect_encoding(file_path)
delimiter = self.detect_delimiter(file_path, encoding)
# Read sample
df = pd.read_csv(file_path, encoding=encoding, delimiter=delimiter,
nrows=10, on_bad_lines='skip')
has_header = not df.columns[0].replace('.', '').replace('-', '').isdigit()
# Full row count
with open(file_path, 'r', encoding=encoding, errors='replace') as f:
row_count = sum(1 for _ in f) - (1 if has_header else 0)
profile = CSVProfile(
encoding=encoding,
delimiter=delimiter,
has_header=has_header,
row_count=row_count,
column_count=len(df.columns),
columns=list(df.columns)
)
self.last_profile = profile
return profile
def read_csv(self, file_path: str,
encoding: Optional[str] = None,
delimiter: Optional[str] = None,
clean: bool = True) -> pd.DataFrame:
"""Read CSV with auto-detection."""
# Auto-detect if not provided
if encoding is None:
encoding = self.detect_encoding(file_path)
if delimiter is None:
delimiter = self.detect_delimiter(file_path, encoding)
# Read with error handling
df = pd.read_csv(
file_path,
encoding=encoding,
delimiter=delimiter,
on_bad_lines='skip',
low_memory=False
)
if clean:
df = self.clean_dataframe(df)
return df
def clean_dataframe(self, df: pd.DataFrame) -> pd.DataFrame:
"""Clean construction CSV data."""
# Clean column names
df.columns = [self._clean_column_name(c) for c in df.columns]
# Remove empty rows and columns
df = df.dropna(how='all')
df = df.dropna(axis=1, how='all')
# Strip whitespace from strings
for col in df.select_dtypes(include=['object']):
df[col] = df[col].str.strip() if df[col].dtype == 'object' else df[col]
return df
def _clean_column_name(self, name: str) -> str:
"""Clean column name."""
if not isinstance(name, str):
return str(name)
# Remove special characters, replace spaces
clean = name.strip().lower()
clean = clean.replace(' ', '_').replace('-', '_')
clean = ''.join(c for c in clean if c.isalnum() or c == '_')
return clean
def merge_csvs(self, file_paths: List[str],
on_column: Optional[str] = None) -> pd.DataFrame:
"""Merge multiple CSV files."""
dfs = []
for path in file_paths:
df = self.read_csv(path)
df['_source_file'] = Path(path).name
dfs.append(df)
if not dfs:
return pd.DataFrame()
if on_column and on_column in dfs[0].columns:
result = dfs[0]
for df in dfs[1:]:
result = pd.merge(result, df, on=on_column, how='outer')
return result
return pd.concat(dfs, ignore_index=True)
def split_csv(self, df: pd.DataFrame,
group_column: str,
output_dir: str) -> List[str]:
"""Split CSV by column values."""
output_path = Path(output_dir)
output_path.mkdir(parents=True, exist_ok=True)
files = []
for value in df[group_column].unique():
subset = df[df[group_column] == value]
filename = f"{group_column}_{value}.csv"
filepath = output_path / filename
subset.to_csv(filepath, index=False)
files.append(str(filepath))
return files
def convert_types(self, df: pd.DataFrame,
type_map: Dict[str, str] = None) -> pd.DataFrame:
"""Convert column types intelligently."""
df = df.copy()
if type_map:
for col, dtype in type_map.items():
if col in df.columns:
try:
df[col] = df[col].astype(dtype)
except:
pass
else:
# Auto-convert
for col in df.columns:
# Try numeric
try:
df[col] = pd.to_numeric(df[col])
continue
except:
pass
# Try datetime
try:
df[col] = pd.to_datetime(df[col])
except:
pass
return df
def export_csv(self, df: pd.DataFrame,
file_path: str,
encoding: str = 'utf-8-sig',
delimiter: str = ',') -> str:
"""Export DataFrame to CSV."""
df.to_csv(file_path, encoding=encoding, sep=delimiter, index=False)
return file_path
# Specialized handlers
class ScheduleCSVHandler(ConstructionCSVHandler):
"""Handler for project schedule CSVs."""
SCHEDULE_COLUMNS = ['task_id', 'task_name', 'start_date', 'end_date',
'duration', 'predecessors', 'resources']
def parse_schedule(self, file_path: str) -> pd.DataFrame:
"""Parse schedule CSV."""
df = self.read_csv(file_path)
# Convert date columns
for col in df.columns:
if 'date' in col.lower() or 'start' in col.lower() or 'end' in col.lower():
try:
df[col] = pd.to_datetime(df[col])
except:
pass
return df
class CostCSVHandler(ConstructionCSVHandler):
"""Handler for cost/estimate CSVs."""
def parse_costs(self, file_path: str) -> pd.DataFrame:
"""Parse cost CSV."""
df = self.read_csv(file_path)
# Find and convert numeric columns
for col in df.columns:
if any(word in col.lower() for word in ['cost', 'price', 'amount', 'total', 'qty', 'quantity']):
df[col] = pd.to_numeric(df[col].replace(r'[\$,]', '', regex=True), errors='coerce')
return df
```
## Quick Start
```python
handler = ConstructionCSVHandler()
# Profile CSV first
profile = handler.profile_csv("export.csv")
print(f"Encoding: {profile.encoding}, Delimiter: '{profile.delimiter}'")
# Read with auto-detection
df = handler.read_csv("export.csv")
print(f"Loaded {len(df)} rows, {len(df.columns)} columns")
```
## Common Use Cases
### 1. Merge Multiple Exports
```python
files = ["jan_export.csv", "feb_export.csv", "mar_export.csv"]
merged = handler.merge_csvs(files)
```
### 2. Split by Category
```python
handler.split_csv(df, group_column='category', output_dir='./split_files')
```
### 3. Schedule Import
```python
schedule_handler = ScheduleCSVHandler()
schedule = schedule_handler.parse_schedule("p6_export.csv")
```
## Resources
- **DDC Book**: Chapter 2.1 - Structured Data
标签
skill
ai