Documentation
¶
Overview ¶
Package metadata provides comprehensive metadata extraction from .doc files.
This package implements full SummaryInformation and DocumentSummaryInformation parsing according to the OLE Property Set specification, supporting all standard document properties and custom properties.
Index ¶
Constants ¶
// Property identifiers (PIDs) for the SummaryInformation stream.
//
// Each identifier is declared on its own line so the group is a valid Go
// const declaration; the numeric values are unchanged.
const (
	PIDTitle        = 0x02 // document title
	PIDSubject      = 0x03 // document subject
	PIDAuthor       = 0x04 // author
	PIDKeywords     = 0x05 // keywords
	PIDComments     = 0x06 // comments
	PIDTemplate     = 0x07 // template name
	PIDLastAuthor   = 0x08 // last author
	PIDRevNumber    = 0x09 // revision number
	PIDEditTime     = 0x0A // total edit time
	PIDLastPrinted  = 0x0B // last printed time
	PIDCreateTime   = 0x0C // creation time
	PIDLastSaveTime = 0x0D // last save time
	PIDPageCount    = 0x0E // page count
	PIDWordCount    = 0x0F // word count
	PIDCharCount    = 0x10 // character count
	PIDThumbnail    = 0x11 // thumbnail
	PIDAppName      = 0x12 // application name
	PIDSecurity     = 0x13 // security flags
)
Property IDs for the SummaryInformation stream.
// Property identifiers (PIDs) for the DocumentSummaryInformation stream.
//
// Values follow the PIDDSI table in MS-OSHARED. In that table 0x12 and 0x19
// are unassigned and 0x13 is the shared-document flag (no constant is declared
// for it here), so the hyperlink, version and digital-signature identifiers
// occupy 0x14 through 0x18.
const (
	PIDCategory            = 0x02 // category
	PIDPresentationFormat  = 0x03 // presentation format
	PIDByteCount           = 0x04 // byte count
	PIDLineCount           = 0x05 // line count
	PIDParaCount           = 0x06 // paragraph count
	PIDSlideCount          = 0x07 // slide count
	PIDNoteCount           = 0x08 // note count
	PIDHiddenCount         = 0x09 // hidden slide count
	PIDMMClipCount         = 0x0A // multimedia clip count
	PIDScale               = 0x0B // scale flag
	PIDHeadingPairs        = 0x0C // heading pairs
	PIDDocParts            = 0x0D // document parts
	PIDManager             = 0x0E // manager
	PIDCompany             = 0x0F // company
	PIDLinksUpToDate       = 0x10 // NOTE(review): MS-OSHARED names this ID LINKSDIRTY; confirm polarity at use sites
	PIDCharCountWithSpaces = 0x11 // character count including spaces
	PIDHyperLinkBase       = 0x14 // hyperlink base (was 0x15)
	PIDHyperLinks          = 0x15 // hyperlinks (was 0x16)
	PIDHyperLinksChanged   = 0x16 // hyperlinks-changed flag (was 0x17)
	PIDVersion             = 0x17 // application version (was 0x18)
	PIDDigSig              = 0x18 // digital signature (was 0x19)
	PIDContentType         = 0x1A // content type
	PIDContentStatus       = 0x1B // content status
	PIDLanguage            = 0x1C // language
	PIDDocVersion          = 0x1D // document version
)
Property IDs for the DocumentSummaryInformation stream.
Variables ¶
This section is empty.
Functions ¶
This section is empty.
Types ¶
type DocumentMetadata ¶
// DocumentMetadata holds the metadata fields extracted from a document:
// summary properties, document-summary properties, custom properties,
// thumbnail data and protection flags.
type DocumentMetadata struct {
// Core properties.
// NOTE(review): this group is not limited to SummaryInformation. The property
// IDs for CharCountWithSpaces and for Category through MultimediaClipCount
// (PIDCharCountWithSpaces, PIDCategory ... PIDMMClipCount) are declared in the
// DocumentSummaryInformation constant group.
Title string // Document title
Subject string // Document subject
Author string // Document author
Keywords string // Document keywords
Comments string // Document comments
Template string // Template name
LastAuthor string // Last saved by
RevisionNumber string // Revision number
ApplicationName string // Creating application
Created time.Time // Creation time
LastSaved time.Time // Last saved time
LastPrinted time.Time // Last printed time
TotalEditTime int64 // Total editing time in minutes
PageCount int32 // Number of pages
WordCount int32 // Number of words
CharCount int32 // Number of characters
CharCountWithSpaces int32 // Number of characters with spaces
Security int32 // Security flags
Category string // Document category
PresentationFormat string // Presentation format
ByteCount int64 // Number of bytes
LineCount int32 // Number of lines
ParagraphCount int32 // Number of paragraphs
SlideCount int32 // Number of slides
NoteCount int32 // Number of notes
HiddenSlideCount int32 // Number of hidden slides
MultimediaClipCount int32 // Number of multimedia clips
// Document summary properties
Company string // Company name
Manager string // Manager name
Language int32 // Document language code (numeric); GetLanguageName returns a readable name
DocumentVersion string // Document version
ContentType string // Content type
ContentStatus string // Content status
HyperLinkBase string // Hyperlink base
CustomProperties map[string]interface{} // Custom properties, keyed by name; values are dynamically typed
// Extended properties
ThumbnailClipboardFormat int32 // Thumbnail format
ThumbnailData []byte // Thumbnail image data
// Security and protection
// NOTE(review): presumably these flags are what IsProtected consults — confirm.
ReadOnlyRecommended bool // Read-only recommended
WriteReservationPassword bool // Write reservation password set
ReadOnlyPassword bool // Read-only password set
}
DocumentMetadata holds comprehensive document metadata information.
func (*DocumentMetadata) GetLanguageName ¶
func (metadata *DocumentMetadata) GetLanguageName() string
GetLanguageName returns the human-readable name of the document's language code.
func (*DocumentMetadata) IsProtected ¶
func (metadata *DocumentMetadata) IsProtected() bool
IsProtected returns true if the document has any protection enabled.
type MetadataExtractor ¶
type MetadataExtractor struct {
// contains filtered or unexported fields
}
MetadataExtractor handles extraction of metadata from .doc files.
func NewMetadataExtractor ¶
func NewMetadataExtractor(reader *ole2.Reader) *MetadataExtractor
NewMetadataExtractor creates a new metadata extractor.
func (*MetadataExtractor) ExtractMetadata ¶
func (me *MetadataExtractor) ExtractMetadata() (*DocumentMetadata, error)
ExtractMetadata extracts complete metadata from the document.
type PropertyType ¶
// PropertyType identifies the data type of a property value as a 16-bit code.
// The PropertyType constants declared in this package are annotated with the
// corresponding VT_* names.
type PropertyType uint16
PropertyType represents the data type of a property.
const ( PropertyTypeEmpty PropertyType = 0x0000 // VT_EMPTY PropertyTypeNull PropertyType = 0x0001 // VT_NULL PropertyTypeInt16 PropertyType = 0x0002 // VT_I2 PropertyTypeInt32 PropertyType = 0x0003 // VT_I4 PropertyTypeFloat PropertyType = 0x0004 // VT_R4 PropertyTypeDouble PropertyType = 0x0005 // VT_R8 PropertyTypeCurrency PropertyType = 0x0006 // VT_CY PropertyTypeDate PropertyType = 0x0007 // VT_DATE PropertyTypeString PropertyType = 0x0008 // VT_BSTR PropertyTypeBoolean PropertyType = 0x000B // VT_BOOL PropertyTypeVariant PropertyType = 0x000C // VT_VARIANT PropertyTypeInt8 PropertyType = 0x0010 // VT_I1 PropertyTypeUInt8 PropertyType = 0x0011 // VT_UI1 PropertyTypeUInt16 PropertyType = 0x0012 // VT_UI2 PropertyTypeUInt32 PropertyType = 0x0013 // VT_UI4 PropertyTypeInt64 PropertyType = 0x0014 // VT_I8 PropertyTypeUInt64 PropertyType = 0x0015 // VT_UI8 PropertyTypeFileTime PropertyType = 0x0040 // VT_FILETIME PropertyTypeBlob PropertyType = 0x0041 // VT_BLOB PropertyTypeClipboardData PropertyType = 0x0047 // VT_CF PropertyTypeStringA PropertyType = 0x001E // VT_LPSTR PropertyTypeStringW PropertyType = 0x001F // VT_LPWSTR )